
    +gdNS                       d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlZddlmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ ddlm,Z- ddl.Z.ddl/Z0ddl1Z2ddl3Z4ddl5m6Z7 ddl8m9Z9m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZDmEZE ddlFmGZG ddlHmIZI ddlJmKZKmLZLmMZMmNZNm,Z,mOZO ddlPmQZQmRZRmSZSmTZTmUZUmVZV ddlWmXZXmYZY ddlZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZd ddlemfZfmgZgmhZhmiZimjZj ddlkmlZlmmZm ddlnmoZompZp ddlqmrZr ddlsmtZt ddlumvZvmwZwmxZxmyZy dd lzm{Z{m|Z|m}Z}m~Z~mZmZmZmZmZmZ dd!lmZ dd"lmZ dd#lmZmZmZ dd$lmZ dd%lmZ dd&lmZ dd'lmZmZmZmZ dd(lmZ dd)lmZmZmZ dd*lmZ 	 dd+lmZ n# e$ r	 dd+lmZ Y nw xY we rddlZddlZddlZdd,lmZ dd-lmZ  ej        e          Z G d. d/          Z G d0 d1          Z G d2 d3e          Zd4 Zd5 Zd6e}d7eMfd8Zd9e}fd:Zd;e'e         fd<Zd= Z G d> d?e          Z G d@ dAeete          Z	 	 	 dWdBe'e         dCe(eo         dDe(ev         dEefdFZ	 	 	 	 	 dXdHe'dA         dIe(e'e                  dJe(e         dCe(eo         dDe(ev         dKe(e         d9dAfdLZdMedNedOed9efdPZ	 dYdQe#dRedSedTe(e*ee'e         f                  dUe(e}         f
dVZdS )Zz( Simple Dataset wrapping an Arrow Table.    N)Counter)Mapping)deepcopy)partialwraps)BytesIO)ceilfloorPath)sample)TYPE_CHECKINGAnyBinaryIOCallableDictIterableIteratorListOptionalTupleUnionoverload)Sequence)HfApiHfFolder)Pool)	HTTPError   )config)ArrowReader)ArrowWriterOptimizedTypedSequence)DownloadConfig)xgetsize)Audio
ClassLabelFeaturesImager   Value)FeatureType_align_features!_check_if_features_can_be_alignedgenerate_from_arrow_typepandas_types_mapperrequire_decoding)extract_path_from_uriis_remote_filesystem)
fingerprint_transformformat_kwargs_for_fingerprint format_transform_for_fingerprintgenerate_fingerprintgenerate_random_fingerprint#get_temporary_cache_files_directoryis_caching_enabled,maybe_register_dataset_for_temp_dir_deletionupdate_fingerprintvalidate_fingerprint)PythonFormatterformat_tableget_format_type_from_aliasget_formatterquery_table)LazyDict_is_range_contiguous)DatasetInfoDatasetInfosDict)	_split_re)IndexableMixin)
NamedSplitSplit	SplitDict	SplitInfo)
InMemoryTableMemoryMappedTableTablecast_array_to_featureconcat_tablesembed_table_storagelist_table_cache_files
table_cast
table_itertable_visitor)TaskTemplate)logging)_retrycached_pathestimate_dataset_size)
hf_hub_url)is_small_dataset)DatasetMetadata)asdictconvert_file_size_to_intiflatmap_unorderedunique_values))stratified_shuffle_split_generate_indices)dataset_to_tfminimal_tf_collate_fnmultiprocess_dataset_to_tf)PathLike)LiteralDatasetDict)IterableDatasetc                   $   e Zd ZdZdedee         fdZed             Z	ed             Z
edefd            Zedefd	            Zedefd
            Zedee         fd            Zedefd            Zedee         fd            Zedee         fd            Zedee         fd            Zedee         fd            Zedee         fd            Zedee         fd            Zed             Zed             Zed             ZdS )DatasetInfoMixinzqThis base class exposes some attributes of DatasetInfo
    at the base level of the Dataset for easy access.
    infosplitc                 "    || _         || _        d S N)_info_split)selfrm   rn   s      6lib/python3.11/site-packages/datasets/arrow_dataset.py__init__zDatasetInfoMixin.__init__   s    
    c                     | j         S )zL[`~datasets.DatasetInfo`] object containing all the metadata in the dataset.)rq   rs   s    rt   rm   zDatasetInfoMixin.info   s     zrv   c                     | j         S )zG[`~datasets.NamedSplit`] object corresponding to a named dataset split.)rr   rx   s    rt   rn   zDatasetInfoMixin.split   s     {rv   returnc                     | j         j        S rp   )rq   builder_namerx   s    rt   r|   zDatasetInfoMixin.builder_name       z&&rv   c                     | j         j        S rp   )rq   citationrx   s    rt   r   zDatasetInfoMixin.citation       z""rv   c                     | j         j        S rp   )rq   config_namerx   s    rt   r   zDatasetInfoMixin.config_name       z%%rv   c                     | j         j        S rp   )rq   dataset_sizerx   s    rt   r   zDatasetInfoMixin.dataset_size   r}   rv   c                     | j         j        S rp   )rq   descriptionrx   s    rt   r   zDatasetInfoMixin.description   r   rv   c                     | j         j        S rp   )rq   download_checksumsrx   s    rt   r   z#DatasetInfoMixin.download_checksums   s    z,,rv   c                     | j         j        S rp   )rq   download_sizerx   s    rt   r   zDatasetInfoMixin.download_size       z''rv   c                 Z    | j         j        | j         j                                        nd S rp   )rq   featurescopyrx   s    rt   r   zDatasetInfoMixin.features   s)    -1Z-@-Ltz"'')))RVVrv   c                     | j         j        S rp   )rq   homepagerx   s    rt   r   zDatasetInfoMixin.homepage   r   rv   c                     | j         j        S rp   )rq   licenserx   s    rt   r   zDatasetInfoMixin.license       z!!rv   c                     | j         j        S rp   )rq   size_in_bytesrx   s    rt   r   zDatasetInfoMixin.size_in_bytes   r   rv   c                     | j         j        S rp   )rq   supervised_keysrx   s    rt   r   z DatasetInfoMixin.supervised_keys   s    z))rv   c                     | j         j        S rp   )rq   task_templatesrx   s    rt   r   zDatasetInfoMixin.task_templates   s    z((rv   c                     | j         j        S rp   )rq   versionrx   s    rt   r   zDatasetInfoMixin.version   r   rv   N)__name__
__module____qualname____doc__rD   r   rH   ru   propertyrm   rn   strr|   r   r   intr   r   dictr   r   r(   r   r   r   r   r   r   r    rv   rt   rl   rl      s        [ *1E       X   X 'c ' ' ' X' ## # # # X# &S & & & X& 'hsm ' ' ' X' &S & & & X& -HTN - - - X- (x} ( ( ( X( W(8, W W W XW #(3- # # # X# "# " " " X" (x} ( ( ( X( * * X* ) ) X) " " X" " "rv   rl   c                   :   e Zd Z e            Ze	 	 	 ddddededee	e
                  dee         d	efd
            Z	 	 	 	 	 	 	 	 	 ddedeee
e	e
         f                  dedee         dedeee
ef                  deee
e	e
         f                  deded	efdZdS )TensorflowDatasetMixinN   datasetDataset
collate_fncollate_fn_argscols_to_retain
batch_sizenum_test_batchesc                 D   t           j        rddl}nt          d          t	          |           dk    rt          d          |t          t	          |           |          }d}!t          t          g dz                       g }t          |          D ]}	t          t          t	          |                     |          }
| |
          fd                                D             fdt          |          D              |fi ||                               i }i }|d                                         D ]Rfd	|D             }g }|D ]}t          |t          j                  r|                    |           2t          ||j                  r(|                    |                                           o|                    t          j        |                     t          j        |d         j        t          j                  s|d         j        t.          k    r|j        }t          j        }nt          j        |d         j        t          j                  r|j        }t          j        }nH|d         j        j        d
k    rt          j        }|j        }nt=          d|d         j         d          d |D             }g }t          t	          |d                             D ]|fd|D             }dk    r|                    |           ,t	          |          dk    r(|                    |                                           g|                    d           }|                     ||          |<   ||<   T||fS )aH  Private method used by `to_tf_dataset()` to find the shapes and dtypes of samples from this dataset
           after being passed through the collate_fn. Tensorflow needs an exact signature for tf.numpy_function, so
           the only way to do this is to run test batches - the collator may add or rename columns, so we can't figure
           it out just by inspecting the dataset.

        Args:
            dataset (`Dataset`): Dataset to load samples from.
            collate_fn(`bool`): Shuffle the dataset order when loading. Recommended True for training, False for
                validation/evaluation.
            collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate
                lists of samples into a batch.
            collate_fn_args (`Dict`): A `dict` of keyword arguments to be passed to the
                `collate_fn`.
            batch_size (`int`, optional): The size of batches loaded from the dataset. Used for shape inference.
                Can be None, which indicates that batch sizes can be variable.
            num_test_batches (`int`): The number of batches to load from the dataset for shape inference.

        Returns:
            `dict`: Dict mapping column names to tf.Tensorspec objects
            `dict`: Dict mapping column names to np.dtype objects
        r   NFCalled a Tensorflow-specific function but Tensorflow is not installed.z@Unable to get the output signature because the dataset is empty.r   )	label_idslabellabelsc                 $    i | ]\  }}|v 	||S r   r   ).0keyvaluer   s      rt   
<dictcomp>z@TensorflowDatasetMixin._get_output_signature.<locals>.<dictcomp>
  s+    gggZS%QTXfQfQfc5QfQfQfrv   c                 R    g | ]"fd                                  D             #S )c                 (    i | ]\  }}||         S r   r   )r   r   r   is      rt   r   zKTensorflowDatasetMixin._get_output_signature.<locals>.<listcomp>.<dictcomp>  s#    NNNZS%3aNNNrv   items)r   r   
test_batchs    @rt   
<listcomp>z@TensorflowDatasetMixin._get_output_signature.<locals>.<listcomp>  s<    oooSTNNNN:;K;K;M;MNNNooorv   c                      g | ]
}|         S r   r   )r   batchcolumns     rt   r   z@TensorflowDatasetMixin._get_output_signature.<locals>.<listcomp>  s    BBBE%-BBBrv   UzUnrecognized array dtype z<. 
Nested types and image/audio types are not supported yet.c                     g | ]	}|j         
S r   )shape)r   arrays     rt   r   z@TensorflowDatasetMixin._get_output_signature.<locals>.<listcomp>+  s    999eek999rv   c                      h | ]
}|         S r   r   )r   r   dims     rt   	<setcomp>z?TensorflowDatasetMixin._get_output_signature.<locals>.<setcomp>.  s    888s888rv   )r   dtype)!r    TF_AVAILABLE
tensorflowImportErrorlen
ValueErrorminlistsetranger   r   appendkeys
isinstancenpndarrayTensornumpyr   
issubdtyper   integerboolint64numberfloat32kindunicode_stringRuntimeErrorpop
TensorSpec)r   r   r   r   r   r   tftest_batch_sizetest_batches_indicestf_columns_to_signaturesnp_columns_to_dtypes
raw_arrays	np_arraysr   tf_dtypenp_dtypeshapesstatic_shapesizesr   r   r   s      `                 @@@rt   _get_output_signaturez,TensorflowDatasetMixin._get_output_signature   s   <  	h#####fgggw<<1_```!S\\:66J%!#n7W7W7W&W"X"XYYN'(( 	, 	,AU3w<<00/BBG )J)gggg:;K;K;M;Mggg
ooooX]^mXnXnoooJ#JBB/BBJ
++++#% !"1o**,, &	4 &	4FBBBB\BBBJI# 6 6eRZ00 6$$U++++ry11 6$$U[[]]3333$$RXe__5555}Yq\/<< 	!@RVZ@Z@Z88y|129== 
::1#(C//;9"P	!0B P P P   :9y999FLS^^,, . .8888888!88 ''
333u::?? ''		4444 ''----/1}}<W_}/`/`$V,+3 ((')===rv   FTr   columnsshuffledrop_remainder
label_colsprefetchnum_workersc                     t           j        rddl}nt          d          t	          t
                    rt                    dk    s(t	          t
                    r-t                    dk    rt          j        dt                     t	          |j
                                        |j
        j                  rt                              d           |	dk    rt          j        dk     rt#          d          |t$          }|i }rst#          d	          g nt	          t&                    rgt          t)                              t                    k     rt#          d
          rtt	          t&                    rgt          t)                              t                    k     rt#          d          t          t)          z                       }nd}g  j        d         dk    r                     d          n                     ||||r|nd|
          \  }}d|v r8dv sdv rdvrd D             dgz   dv sdv rdvrd D             dgz   D ]}||vrt#          d| d          D ]}||vrt#          d| d          |	dk    rt1          ||||||||	  	        }n/|	dk    rt3          |||||||||	
  
        }nt#          d          fd}||                    |          }|r$|                    |j        j        j                  } fd} j                             tC          j"        ||                     |S )au  Create a `tf.data.Dataset` from the underlying Dataset. This `tf.data.Dataset` will load and collate batches from
        the Dataset, and is suitable for passing to methods like `model.fit()` or `model.predict()`. The dataset will yield
        `dicts` for both inputs and labels unless the `dict` would contain only a single key, in which case a raw
        `tf.Tensor` is yielded instead.

        Args:
            batch_size (`int`):
                Size of batches to load from the dataset.
            columns (`List[str]` or `str`, *optional*):
                Dataset column(s) to load in the `tf.data.Dataset`.
                Column names that are created by the `collate_fn` and that do not exist in the original dataset can be used.
            shuffle(`bool`, defaults to `False`):
                Shuffle the dataset order when loading. Recommended `True` for training, `False` for
                validation/evaluation.
            drop_remainder(`bool`, defaults to `False`):
                Drop the last incomplete batch when loading. Ensures
                that all batches yielded by the dataset will have the same length on the batch dimension.
            collate_fn(`Callable`, *optional*):
                A function or callable object (such as a `DataCollator`) that will collate
                lists of samples into a batch.
            collate_fn_args (`Dict`, *optional*):
                An optional `dict` of keyword arguments to be passed to the
                `collate_fn`.
            label_cols (`List[str]` or `str`, defaults to `None`):
                Dataset column(s) to load as labels.
                Note that many models compute loss internally rather than letting Keras do it, in which case
                passing the labels here is optional, as long as they're in the input `columns`.
            prefetch (`bool`, defaults to `True`):
                Whether to run the dataloader in a separate thread and maintain
                a small buffer of batches for training. Improves performance by allowing data to be loaded in the
                background while the model is training.
            num_workers (`int`, defaults to `0`):
                Number of workers to use for loading the dataset. Only supported on Python versions >= 3.8.
            num_test_batches (`int`, defaults to `20`):
                Number of batches to use to infer the output signature of the dataset.
                The higher this number, the more accurate the signature will be, but the longer it will take to
                create the dataset.

        Returns:
            `tf.data.Dataset`

        Example:

        ```py
        >>> ds_train = ds["train"].to_tf_dataset(
        ...    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        ...    shuffle=True,
        ...    batch_size=16,
        ...    collate_fn=data_collator,
        ... )
        ```
        r   Nr   r   a  The output of `to_tf_dataset` will change when a passing single element list for `labels` or `columns` in the next datasets version. To return a tuple structure rather than dict, pass a single string.
Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) a>  Note that to_tf_dataset() loads the data with a generator rather than a full tf.data pipeline and is not compatible with remote TPU connections. If you encounter errors, please try using a TPU VM or, if your data can fit in memory, loading it into memory as a dict of Tensors instead of streaming with to_tf_dataset().)      zCUsing multiple workers is only supported on Python versions >= 3.8.z5Cannot specify label_cols without specifying columns!z'List of label_cols contains duplicates.z$List of columns contains duplicates.typecustomr   )r   r   r   r   r   r   r   r   c                     g | ]}|d v|	S )r   r   r   r   cols     rt   r   z8TensorflowDatasetMixin.to_tf_dataset.<locals>.<listcomp>  s#    WWW3S@V5V5V35V5V5Vrv   c                     g | ]}|d v|	S r  r   r  s     rt   r   z8TensorflowDatasetMixin.to_tf_dataset.<locals>.<listcomp>  s#    ]]]c3F\;\;\c;\;\;\rv   zColumn z not found in dataset!zLabel column )	r   r   r   r   columns_to_np_typesoutput_signaturer   r   r   )
r   r   r   r   r  r  r   r   r   r   znum_workers must be >= 0c                    fd|                                  D             }fd|                                  D             }t          |          dk    r't          |                                          d         }t          |          dk    r't          |                                          d         }t	          |t
                    rt          |          dk    r|S ||fS )Nc                 $    i | ]\  }}|v 	||S r   r   )r   r   tensorr   s      rt   r   z[TensorflowDatasetMixin.to_tf_dataset.<locals>.split_features_and_labels.<locals>.<dictcomp>  s%    ]]]VcU\nnVnnnrv   c                 $    i | ]\  }}|v 	||S r   r   )r   r   r  r   s      rt   r   z[TensorflowDatasetMixin.to_tf_dataset.<locals>.split_features_and_labels.<locals>.<dictcomp>  s*    ^^^kc6CS]L]L]c6L]L]L]rv   r   r   )r   r   r   valuesr   r   )input_batchr   r   r   r   s      rt   split_features_and_labelszGTensorflowDatasetMixin.to_tf_dataset.<locals>.split_features_and_labels  s    ]]]]{7H7H7J7J]]]H^^^^[5F5F5H5H^^^F8}}!! 1 122156{{afmmoo..q1&$'' (CKK1,<,<''rv   c                 d                                      j                            |            d S rp   )__del___TF_DATASET_REFSremove)refr   rs   s    rt   cleanup_callbackz>TensorflowDatasetMixin.to_tf_dataset.<locals>.cleanup_callback  s0    OO!((-----rv   )#r    r   r   r   r   r   r   warningswarnFutureWarning
distributeget_strategyTPUStrategyloggerwarningsysversion_infor   rd   r   r   formatwith_formatr   rc   re   mapr   dataexperimentalAUTOTUNEr  addweakrefr  )rs   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  
tf_datasetr  r  r   s   ` `    `           @rt   to_tf_datasetz$TensorflowDatasetMixin.to_tf_dataset;  s   B  	h#####fgggw%% 	#g,,!*;*;z4(( +<-0__-A-AMY 	 	 	 bm0022BM4MNN 	NNE   ??s/&88bccc.J" O 	Vg 	VTUUUJJ
C(( 	&$Js:#j//11FGGG 	'3'' $")3w<<  3w<<// !GHHH!#g
&:";";<<NN!NG;v(**&&w//GGG 180M0M!+)%3=zz- 1N 1
 1
-- '''w&&'W*<*<(RYBYBYWW'WWW[cZddz))W
-B-BXbHbHb]]Z]]]ai`jj
 	H 	HC*** !F3!F!F!FGGG +  	N 	NC*** !L!L!L!LMMM + !&-% /$7!1%-
 
 
JJ 1__3-% /$7!1%-'  JJ 7888	( 	( 	( 	( 	( 	( %#(ABBJ 	L#,,RW-A-JKKJ	. 	. 	. 	. 	. 	. 	!!'+j:J"K"KLLLrv   )NNr   )	NFNFNNTr   r   )r   r   r   r   r  staticmethodr   r   r   r   r   r   r   r   r   r   r   r)  r   rv   rt   r   r      s~       suu
 /3$( "_> _>_>_> _> !c+	_>
 SM_> _> _> _> \_>H 48)-$486: "J JJ %T#Y/0J 	J
 X&J J "$sCx.1J U3S	>23J J J J J J J J Jrv   r   c                       e Zd ZdS )$DatasetTransformationNotAllowedErrorNr   r   r   r   rv   rt   r,  r,    s        Drv   r,  c                 J     t                      fd            }d|_        |S )z|Wrapper for dataset transforms that recreate a new Dataset to transmit the format of the original dataset to the new datasetc                     | r| d         }| dd          } n|                     d          }t          |j                  t          |j        pg           z
  }|j        |j        |j        |j        d} |g| R i |}t          |t                    r!t          |
                                          n|g}|D ]}|                                }|d         't          t          |j                  |z
            |d<   |j        |j        |j        t          |j                  nd |j        d}	|	|k    r|j        }
 |j        di | |
|_        |S )Nr   r   rs   r   format_kwargsr   output_all_columnsr   r   )r   r   column_names_format_columns_format_type_format_kwargs_output_all_columnsr   r   r   r  r   sorted_fingerprint
set_format)argskwargsrs   unformatted_columnsself_formatoutdatasetsr   
new_format
out_formatfingerprintfuncs              rt   wrapperz transmit_format.<locals>.wrapper  s    	1"1gD8DD$jj00D!$"344s4;O;USU7V7VV%!0+"&":	
 
 04tD/J4/J/J/J6/J/J:DS$:O:O$ZD$6$6$6VYUZ 	3 	3G$))++J)$0(.s73G/H/HK^/^(_(_
9%,!(!7>E>U>a6'"9:::gk&-&A	 J Z''%2""00Z000'2$
rv   transmit_formatr   _decorator_name_rD  rE  s   ` rt   rF  rF    s>     4[[! ! ! ! [!F  1GNrv   c                 J     t                      fd            }d|_        |S )zWrapper for dataset transforms that recreate a new Dataset to transmit the task templates of the original dataset to the new datasetc                  R   | r| d         | dd          } n|                     d           g| R i |}t          |t                    r!t          |                                          n|g}|D ]1j        j        #fdj        j        D             j        _        2|S )Nr   r   rs   c                 z    g | ]7}t          fd |j                                        D                       5|8S )c              3      K   | ]D}j         j                            |          j         j                            |          k    V  Ed S rp   )rq   r   get)r   kr   rs   s     rt   	<genexpr>z=transmit_tasks.<locals>.wrapper.<locals>.<listcomp>.<genexpr>J  sc          .221559L9P9PQR9S9SS     rv   )allcolumn_mappingr   )r   templater   rs   s     rt   r   z3transmit_tasks.<locals>.wrapper.<locals>.<listcomp>G  ss     / / /      !)!8!=!=!?!?    // / /rv   )r   r   r   r   r  rm   r   )r;  r<  r?  r@  r   rs   rD  s       @@rt   rE  ztransmit_tasks.<locals>.wrapper:  s     	1"1gD8DD$jj00D/3tD/J4/J/J/J6/J/J:DS$:O:O$ZD$6$6$6VYUZ 
	 
	Gy'3/ / / / /$(I$</ / /+ 
rv   transmit_tasksrG  rI  s   ` rt   rT  rT  7  s=     4[[    [,  0GNrv   tabler   c                 &   t          fd| j        D                       | j        j        d| j        j        vr#t	          j        t                              }nt          j        | j        j        d         	                                          }d|vr!t          t                              |d<   n,t          t                              d         |d         d<   dt          j        |          i}|                     |          } | S )zTo be used in dataset transforms that modify the features of the dataset, in order to update the features stored in the metadata of its schema.c                 "    i | ]}||         S r   r   )r   col_namer   s     rt   r   z1update_metadata_with_features.<locals>.<dictcomp>W  s     YYY(8H#5YYYrv   N   huggingfacer   rm   r   huggingface)r(   r3  schemametadatar"   _build_metadatarD   jsonloadsdecoder^   dumpsreplace_schema_metadata)rU  r   pa_metadatar]  s    `  rt   update_metadata_with_featuresre  U  s   YYYYeFXYYYZZH|$el>S(S(S!1+x2P2P2PQQ:el3NCJJLLMM!!%k8&D&D&DEEHV+1+x2P2P2P+Q+QR\+]HVZ($dj&:&:;))+66ELrv   rz   c                     t          | t          j                  rt          |           S t          | t                    r| S t	          d|  d          )zVWe check the table type to make sure it's an instance of :class:`datasets.table.Table`zCExpected a pyarrow.Table or a datasets.table.Table object, but got .)r   parN   rL   	TypeError)rU  s    rt   _check_tablerj  e  sZ    %"" h U###	E5	!	! hf^cfffgggrv   r3  c                     t          |           t          d                                 D                       s!fdD             }t          d| d          dS )zBCheck the column names to make sure they don't contain duplicates.c              3   "   K   | ]
}|d k    V  dS )r   Nr   )r   counts     rt   rP  z&_check_column_names.<locals>.<genexpr>t  s&      88euz888888rv   c                 ,    g | ]}|         d k    |S r   r   )r   r  counters     rt   r   z'_check_column_names.<locals>.<listcomp>u  s'    IIIcq8H8Hc8H8H8Hrv   z4The table can't have duplicated columns but columns z are duplicated.N)r   rQ  r  r   )r3  duplicated_columnsrp  s     @rt   _check_column_namesrr  q  s{    l##G88w~~'7'788888 vIIIIWIIItPbtttuuuv vrv   c                 \    | dk     r	| |z   dk     s| |k    rt          d|  d| d          d S )Nr   zIndex z" out of range for dataset of size rg  )
IndexError)indexsizes     rt   _check_valid_indices_valuerw  y  sJ    		edlQ&&ETMMR%RR4RRRSSS -:Mrv   c                       e Zd ZdZdS )NonExistentDatasetErrorz.Used when we expect the existence of a datasetN)r   r   r   r   r   rv   rt   ry  ry  ~  s        88Drv   ry  c            .           e Zd ZdZ	 	 	 	 ddedee         dee         dee         dee         f
dZ	e
d	ef fd
            Ze	 	 	 	 ddedee         dee         dee         ded	d fd            Ze	 	 	 ddej        dee         dee         deej                 d	d f
d            Ze	 	 	 	 ddej        dee         dee         dee         dee         d	d fd            Ze	 	 	 ddedee         dee         dee         d	d f
d            Ze	 	 	 ddee         dee         dee         dee         d	d f
d            Ze	 	 	 	 	 ddeeee         f         dee         dee         dededee         fd            Ze	 	 	 	 	 ddedee         deded ee         dee         fd!            Z e	 	 	 	 	 	 ddeeee         f         dee         dee         deded"ee         dee         fd#            Z!e	 	 	 	 	 	 ddeeee         f         dee         dee         deded$eee                  dee         fd%            Z"e	 	 	 	 	 ddeeee         f         dee         dee         dededee         fd&            Z#e	 	 	 	 ddd(dee         dee         ded)ef
d*            Z$e	 	 	 dd+eed,f         d-eed.d/d0f         dee         dedef
d1            Z%d2 Z&d3 Z'd4 Z(	 	 	 	 	 dd6ed7eeeef                  d8ee         dee         d9ee         f
d:Z)ed;ed<d d=ed9ee         fd>            Z*ed?ed	e+fd@            Z,e	 	 	 dd6edee         d9ee         d	d fdA            Z-e
d	efdB            Z.e
d	ee         fdC            Z/e
d	efdD            Z0e
d	efdE            Z1e
d	ee         fdF            Z2e
d	e3eef         fdG            Z4dHed	efdIZ5ddHedJed	d fdKZ6 e7dL          ddNee         d	d fdO            Z8	 	 	 	 	 	 ddedQee         ded)ee         dRee         dSee         dee         d	d fdTZ9 e7dL          ddHedUe:dNee         d	d fdV            Z;e<e= e7dL          ddWeeee         f         dNee         d	d fdX                                    Z>e< e7dL          	 ddYedZedNee         d	d fd[                        Z?e< e7dL          dd\e@eef         dNee         d	d fd]                        ZAe<e= e7dL          ddWeeee         f         dNee         d	d fd^                                    ZBd_ ZCd` ZDddQedaefdbZEdc ZFe
dd             ZGeHjI        	 	 	 ddeee         d$ee         dfefdg            ZJ e7d'L          	 	 	 ddeee         d$ee         dfefdh            ZKdi ZL	 	 ddjee         d$ee         dfefdkZM	 	 	 ddeee         d$ee         dfefdlZN	 	 ddjee         d$ee         dfefdmZOddoeeePf         dped	d fdqZQdreeeRef         d	ee@ef         fdsZSeTdreeeReUe         f         d	e@fdt            ZVeTdred	efdu            ZVdv ZVdwed	efdxZWd	efdyZXdz ZYe<e=	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd|ee         d}ed~edeeeee         f                  dedQee         daedeeeee         f                  ded)ee         dRee         dSee         dee         dedee         dee         dedNee         dee         d	d f(d                        ZZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd<d d|ee         d}ed~edeee                  dedQee         daedeee                  dedRee         dSee         dee         dedee         dNee         dee         ded	eUe3eeeed f         f                  f&d            Z[e= e7dg dd          	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd|ee         deeeee         f                  dedQee         ded)ee         dRee         dSee         dee         dee         dedNee         dee         d	d fd                        Z\e= e7ddRg          	 	 	 	 	 	 	 ddedRee         dSee         dee         dedee         dNee         d	d fd                        Z]	 	 	 ddee         deej                 dee         d	d fdZ^e= e7ddg          	 	 	 	 ddeUdedee         dSee         dNee         d	d fd                        Z_e= e7dL          	 ddededNee         d	d fd                        Z`e= e7ddg          	 	 	 	 ddeUdedee         dSee         dNee         d	d fd                        Zae= e7dd)dg          	 	 	 	 	 	 	 	 ddWeeebe         f         deeebe         f         deded)ee         dee         dSee         dNee         d	d fd                        Zce= e7dd'd)dg          	 	 	 	 	 	 	 ddee         deedje        jf                 ded)ee         dee         dSee         dNee         d	d fd                        Zge= e7dd'ddgg d          	 	 	 	 	 	 	 	 	 	 	 	 	 ddeehedf         deehedf         dedee         dee         deedje        jf                 ded)ee         dee         dee         dSee         dee         dee         d	dfd                        Zi	 	 	 	 dd8ededededee         dSee         d	d fdZj	 ddedefdZk	 	 ddeeelf         dQee         dee         d	efdZmddQee         d	eeene         f         fdZod	epfdZq	 	 ddeeelf         dQee         dee         d	efdZr	 ddQee         ded	eej        enej                 f         fdZs	 ddeeelf         dQee         d	efdZt	 dded-eed.d/d0f         dQee         d	efdZud	efdZveded          fd            Zwdd8ee         d	dfdZx	 	 	 	 	 	 	 ddedee         dee         dee         dee         d7eeeef                  d8ee         ded	e3eeeef         fdńZy	 	 	 	 	 	 	 ddedee         dee         dee         dee         d7eeeef                  d8ee         defdƄZze= e7dL          dedHeepedj{        f         dNefdǄ                        Z|ddddddPddedj}        f	dHedee         dee         dee         dee         ded         dQedee         def fdτZ~dddddPddedj}        fdedj{        dedee         dee         dee         ded         dQedee         def fdфZ	 	 	 	 	 	 ddHedee         dee         dee         ded         dee         dee         f fd؄Ze= e7dL          dedNefdڄ                        Zde@ded	d fd݄Z xZS )r   z#A Dataset backed by an Arrow table.Narrow_tablerm   rn   indices_tablerC  c           
         ||                                 nt                      }t                              | ||           t	          j        |            t          |          | _        |t          |          nd | _        t          |            d | _	        i | _
        d | _        d| _        || _        | j        j        j        fd| j        j        j        v rSt!          j        | j        j        j        d                                                   }d|v r| j        |d         | _        t'          j        |j                  }| j        j        || j        _        nO	 | j        j                            |          | j        _        n$# t0          $ r}t1          | d          d }~ww xY w| j        t3          |           | _        | j        j        t1          d          | j        t1          d          | j        j        j        |j        k    r9t1          d| j        j         d	| j        j        j         d
| d	|j                   | j        kt8          j                            | j                            d          j                  s/t1          d| j                            d          j                   tA          | j        j!                   tE          | j        | j        j                  | _        d S )Nrm   rn   FrY  rC  zn
The 'source' features come from dataset_info.json, and the 'target' ones are those of the dataset arrow file.*Features can't be None in a Dataset objectz-Fingerprint can't be None in a Dataset objectz4External features info don't match the dataset:
Got
z
with type
z

but expected something like
r   zEindices must be an Arrow table of unsigned integers, current type is )#r   rD   rl   ru   rG   rj  _data_indicesr:   r5  r6  r4  r7  r9  r\  r]  r_  r`  ra  r(   from_arrow_schemarm   r   reorder_fields_asr   r6   rq   r   rh  typesis_unsigned_integerr   rr  r3  re  )	rs   r{  rm   rn   r|  rC  r]  inferred_featureses	            rt   ru   zDataset.__init__  s     #.tyy{{{KMM!!$T!???%%%(55
HUHam)D)D)Dgk4T:::+/$&/3). !, :%1n
HYHb6b6bz$*"3"<^"L"S"S"U"UVVH))d.?.G$,]$;! %6{7IJJ9%!2DI%)Y%7%I%IJ[%\%\	""      J  J  J   $ 4T : :D :&IJJJ$LMMM9"&7&<<< ]I[  ]  ]jnjsj|  kB  ]  ]  ev  ]  ]  EV  E[  ]  ]   =$8//0D0DQ0G0G0LMM  z\`\i\p\pqr\s\s\xzz   	DJ344424:tz?RSS


s   )F 
F)F$$F)rz   c                 P    t                      j        }|t          d          |S )Nr  )superr   r   )rs   r   	__class__s     rt   r   zDataset.features  s)    77#IJJJrv   Ffilenameindices_filename	in_memoryc                     t          j        ||          }|t          j        ||          }nd} | ||||          S )aX  Instantiate a Dataset backed by an Arrow table at filename.

        Args:
            filename (`str`):
                File name of the dataset.
            info (`DatasetInfo`, *optional*):
                Dataset information, like description, citation, etc.
            split (`NamedSplit`, *optional*):
                Name of the dataset split.
            indices_filename (`str`, *optional*):
                File names of the indices.
            in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.

        Returns:
            [`Dataset`]
        )r  N)r{  rm   rn   r|  )r!   
read_table)clsr  rm   rn   r  r  rU  indices_pa_tables           rt   	from_filezDataset.from_file  se    4 &x9EEE'*56FR[\\\#s*	
 
 
 	
rv   bufferindices_bufferc                 z    t          j        |          }|t          j        |          }nd} | ||||          S )a  Instantiate a Dataset backed by an Arrow buffer.

        Args:
            buffer (`pyarrow.Buffer`):
                Arrow buffer.
            info (`DatasetInfo`, *optional*):
                Dataset information, like description, citation, etc.
            split (`NamedSplit`, *optional*):
                Name of the dataset split.
            indices_buffer (`pyarrow.Buffer`, *optional*):
                Indices Arrow buffer.

        Returns:
            [`Dataset`]
        N)rm   rn   r|  )rL   from_buffer)r  r  rm   rn   r  rU  r|  s          rt   r  zDataset.from_buffer  sJ    . )&11%)5f==MM Ms5t5NNNNrv   dfr   preserve_indexc                     |'|%|j         |k    rt          d| d|j                    ||n
||j         nd}|t                      }||_         t          j        ||          }||                    |j                  } | |||          S )a  
        Convert `pandas.DataFrame` to a `pyarrow.Table` to create a [`Dataset`].

        The column types in the resulting Arrow Table are inferred from the dtypes of the `pandas.Series` in the
        DataFrame. In the case of non-object Series, the NumPy dtype is translated to its Arrow equivalent. In the
        case of `object`, we need to guess the datatype by looking at the Python objects in this Series.

        Be aware that Series of the `object` dtype don't carry enough information to always lead to a meaningful Arrow
        type. In the case that we cannot infer a type, e.g. because the DataFrame is of length 0 or the Series only
        contains `None/nan` objects, the type is set to `null`. This behavior can be avoided by constructing explicit
        features and passing it to this function.

        Args:
            df (`pandas.DataFrame`):
                Dataframe that contains the dataset.
            features ([`Features`], *optional*):
                Dataset features.
            info (`DatasetInfo`, *optional*):
                Dataset information, like description, citation, etc.
            split (`NamedSplit`, *optional*):
                Name of the dataset split.
            preserve_index (`bool`, *optional*):
                Whether to store the index as an additional column in the resulting Dataset.
                The default of `None` will store the index as a column, except for `RangeIndex` which is stored as metadata only.
                Use `preserve_index=True` to force it to be stored as a column.

        Returns:
            [`Dataset`]

        Example:

        ```py
        >>> ds = Dataset.from_pandas(df)
        ```
        NIFeatures specified in `features` and `info.features` can't be different:

)r  r  r~  )r   r   rD   rL   from_pandascastarrow_schema)r  r  r   rm   rn   r  rU  s          rt   r  zDataset.from_pandas  s    X  4(9R9Rx]exximivxx    (388$JZ`d<==D ))
 
 
  JJx455Es5t51111rv   mappingc                    |'|%|j         |k    rt          d| d|j                    ||n
||j         nd}i }|                                D ]\  }}t          |t          j        t          j        f          r|t          |||                   n|}n4t          ||	                    ||          n||||         nd|          }|||<   |}t          j        |          }|t                      }||_         |j         0t          d |                                D                       |_          | |||          S )a  
        Convert `dict` to a `pyarrow.Table` to create a [`Dataset`].

        Args:
            mapping (`Mapping`):
                Mapping of strings to Arrays or Python lists.
            features ([`Features`], *optional*):
                Dataset features.
            info (`DatasetInfo`, *optional*):
                Dataset information, like description, citation, etc.
            split (`NamedSplit`, *optional*):
                Name of the dataset split.

        Returns:
            [`Dataset`]
        Nr  r  )r   r  )r  c                     i | ]T\  }}|t          |t          j        t          j        f          rt	          |j                  n|                                US r   )r   rh  ArrayChunkedArrayr.   r   get_inferred_type)r   r  r#  s      rt   r   z%Dataset.from_dict.<locals>.<dictcomp>  sh        "T !$2?(CDD21$)<<<//11  rv   r~  )r   r   r   r   rh  r  r  rO   r#   encode_columnrL   from_pydictrD   r(   )	r  r  r   rm   rn   arrow_typed_mappingr  r#  pa_tables	            rt   	from_dictzDataset.from_dictX  s   0  4(9R9Rx]exximivxx    (388$JZ`d   		, 		,IC$2? ;<< EMEY,T8C=AAA_c-9A9MH**4555SW*2*>#D  
 (,$$% ,W===<==D = $  &-]]__	   DM s8$e4444rv   c                 d    rfdd         D             ni |                      |||          S )ax  
        Convert a list of dicts to a `pyarrow.Table` to create a :class:`Dataset`.

        Note that the keys of the first entry will be used to determine the dataset columns,
        regardless of what is passed to features.

        Args:
            mapping (`List[dict]`): A list of mappings of strings to row values.
            features (:class:`Features`, optional): Dataset features.
            info (:class:`DatasetInfo`, optional): Dataset information, like description, citation, etc.
            split (:class:`NamedSplit`, optional): Name of the dataset split.

        Returns:
            :class:`Dataset`
        c                 0    i | ]fd D             S )c                 :    g | ]}|                               S r   )rN  )r   rrO  s     rt   r   z0Dataset.from_list.<locals>.<dictcomp>.<listcomp>  s#    111AquuQxx111rv   r   )r   rO  r  s    @rt   r   z%Dataset.from_list.<locals>.<dictcomp>  s1    FFFa11111111FFFrv   r   )r  )r  r  r   rm   rn   s    `   rt   	from_listzDataset.from_list  sH    0 KRYFFFF71:FFFFWY}}Whe<<<rv   path_or_paths	cache_dirkeep_in_memorynum_procc           	      P    ddl m}  || f|||||d|                                S )a  Create Dataset from CSV file(s).

        Args:
            path_or_paths (`path-like` or list of `path-like`):
                Path(s) of the CSV file(s).
            split ([`NamedSplit`], *optional*):
                Split name to be assigned to the dataset.
            features ([`Features`], *optional*):
                Dataset features.
            cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`):
                Directory to cache data.
            keep_in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.
            num_proc (`int`, *optional*, defaults to `None`):
                Number of processes when downloading and generating the dataset locally.
                This is helpful if the dataset is made of multiple files. Multiprocessing is disabled by default.

                <Added version="2.8.0"/>
            **kwargs (additional keyword arguments):
                Keyword arguments to be passed to [`pandas.read_csv`].

        Returns:
            [`Dataset`]

        Example:

        ```py
        >>> ds = Dataset.from_csv('path/to/dataset.csv')
        ```
        r   )CsvDatasetReaderrn   r   r  r  r  )io.csvr  read)r  rn   r   r  r  r  r<  r  s           rt   from_csvzDataset.from_csv  s]    R 	-,,,,,
)
 
 
 
 $&&	rv   	generator
gen_kwargsc           
      P    ddl m}  |d| |||||d|                                S )a  Create a Dataset from a generator.

        Args:
            generator (:`Callable`):
                A generator function that `yields` examples.
            features ([`Features`], *optional*):
                Dataset features.
            cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`):
                Directory to cache data.
            keep_in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.
            gen_kwargs(`dict`, *optional*):
                Keyword arguments to be passed to the `generator` callable.
                You can define a sharded dataset by passing the list of shards in `gen_kwargs`.
            num_proc (`int`, *optional*, defaults to `None`):
                Number of processes when downloading and generating the dataset locally.
                This is helpful if the dataset is made of multiple files. Multiprocessing is disabled by default.

                <Added version="2.7.0"/>
            **kwargs (additional keyword arguments):
                Keyword arguments to be passed to :[`GeneratorConfig`].

        Returns:
            [`Dataset`]

        Example:

        ```py
        >>> def gen():
        ...     yield {"text": "Good", "label": 0}
        ...     yield {"text": "Bad", "label": 1}
        ...
        >>> ds = Dataset.from_generator(gen)
        ```

        ```py
        >>> def gen(shards):
        ...     for shard in shards:
        ...         with open(shard) as f:
        ...             for line in f:
        ...                 yield {"line": line}
        ...
        >>> shards = [f"data{i}.txt" for i in range(32)]
        >>> ds = Dataset.from_generator(gen, gen_kwargs={"shards": shards})
        ```
        r   )GeneratorDatasetInputStream)r  r   r  r  r  r  r   )io.generatorr  r  )r  r   r  r  r  r  r<  r  s           rt   from_generatorzDataset.from_generator  s]    p 	>=====** 
)!
 
 
 
 $&&	rv   fieldc           
      R    ddl m}  || f||||||d|                                S )a"  Create Dataset from JSON or JSON Lines file(s).

        Args:
            path_or_paths (`path-like` or list of `path-like`):
                Path(s) of the JSON or JSON Lines file(s).
            split ([`NamedSplit`], *optional*):
                Split name to be assigned to the dataset.
            features ([`Features`], *optional*):
                 Dataset features.
            cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`):
                Directory to cache data.
            keep_in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.
            field (`str`, *optional*):
                Field name of the JSON file where the dataset is contained in.
            num_proc (`int`, *optional* defaults to `None`):
                Number of processes when downloading and generating the dataset locally.
                This is helpful if the dataset is made of multiple files. Multiprocessing is disabled by default.

                <Added version="2.8.0"/>
            **kwargs (additional keyword arguments):
                Keyword arguments to be passed to [`JsonConfig`].

        Returns:
            [`Dataset`]

        Example:

        ```py
        >>> ds = Dataset.from_json('path/to/dataset.json')
        ```
        r   )JsonDatasetReader)rn   r   r  r  r  r  )io.jsonr  r  )	r  rn   r   r  r  r  r  r<  r  s	            rt   	from_jsonzDataset.from_json$  s`    X 	/.....  	
)	
 	
 	
 	
 $&&		rv   r   c           
      R    ddl m}  || f||||||d|                                S )a  Create Dataset from Parquet file(s).

        Args:
            path_or_paths (`path-like` or list of `path-like`):
                Path(s) of the Parquet file(s).
            split (`NamedSplit`, *optional*):
                Split name to be assigned to the dataset.
            features (`Features`, *optional*):
                Dataset features.
            cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`):
                Directory to cache data.
            keep_in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.
            columns (`List[str]`, *optional*):
                If not `None`, only these columns will be read from the file.
                A column name may be a prefix of a nested field, e.g. 'a' will select
                'a.b', 'a.c', and 'a.d.e'.
            num_proc (`int`, *optional*, defaults to `None`):
                Number of processes when downloading and generating the dataset locally.
                This is helpful if the dataset is made of multiple files. Multiprocessing is disabled by default.

                <Added version="2.8.0"/>
            **kwargs (additional keyword arguments):
                Keyword arguments to be passed to [`ParquetConfig`].

        Returns:
            [`Dataset`]

        Example:

        ```py
        >>> ds = Dataset.from_parquet('path/to/dataset.parquet')
        ```
        r   )ParquetDatasetReader)rn   r   r  r  r   r  )
io.parquetr  r  )	r  rn   r   r  r  r   r  r<  r  s	            rt   from_parquetzDataset.from_parquet]  s`    \ 	544444##	
)	
 	
 	
 	
 $&&		rv   c           	      P    ddl m}  || f|||||d|                                S )a  Create Dataset from text file(s).

        Args:
            path_or_paths (`path-like` or list of `path-like`):
                Path(s) of the text file(s).
            split (`NamedSplit`, *optional*):
                Split name to be assigned to the dataset.
            features (`Features`, *optional*):
                Dataset features.
            cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`):
                Directory to cache data.
            keep_in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.
            num_proc (`int`, *optional*, defaults to `None`):
                Number of processes when downloading and generating the dataset locally.
                This is helpful if the dataset is made of multiple files. Multiprocessing is disabled by default.

                <Added version="2.8.0"/>
            **kwargs (additional keyword arguments):
                Keyword arguments to be passed to [`TextConfig`].

        Returns:
            [`Dataset`]

        Example:

        ```py
        >>> ds = Dataset.from_text('path/to/dataset.txt')
        ```
        r   )TextDatasetReaderr  )io.textr  r  )r  rn   r   r  r  r  r<  r  s           rt   	from_textzDataset.from_text  s]    R 	/.....  
)
 
 
 
 $&&	rv   Tzpyspark.sql.DataFrameload_from_cache_filec                     ddl m} t          j        dk    rt	          d           || f||||d|                                S )a  Create Dataset from Spark DataFrame. Dataset downloading is distributed over Spark workers.

        Args:
            df (`pyspark.sql.DataFrame`):
                The DataFrame containing the desired data.
            split (`NamedSplit`, *optional*):
                Split name to be assigned to the dataset.
            features (`Features`, *optional*):
                Dataset features.
            cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`):
                Directory to cache data. When using a multi-node Spark cluster, the cache_dir must be accessible to both
                workers and the driver.
            load_from_cache_file (`bool`):
                Whether to load the dataset from the cache if possible.

        Returns:
            [`Dataset`]

        Example:

        ```py
        >>> df = spark.createDataFrame(
        >>>     data=[[1, "Elia"], [2, "Teo"], [3, "Fang"]],
        >>>     columns=["id", "name"],
        >>> )
        >>> ds = Dataset.from_spark(df)
        ```
        r   )SparkDatasetReaderwin32z9Datasets.from_spark is not currently supported on Windows)rn   r   r  r  )io.sparkr  r  platformEnvironmentErrorr  )r  rn   r   r  r  r<  r  s          rt   
from_sparkzDataset.from_spark  sv    L 	100000<7"""#^___!!
!5
 
 
 
 $&&	rv   sqlzsqlalchemy.sql.Selectableconzsqlalchemy.engine.Connectionzsqlalchemy.engine.Enginezsqlite3.Connectionc                 N    ddl m}  || |f|||d|                                S )aX  Create Dataset from SQL query or database table.

        Args:
            sql (`str` or `sqlalchemy.sql.Selectable`):
                SQL query to be executed or a table name.
            con (`str` or `sqlite3.Connection` or `sqlalchemy.engine.Connection` or `sqlalchemy.engine.Connection`):
                A [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) used to instantiate a database connection or a SQLite3/SQLAlchemy connection object.
            features ([`Features`], *optional*):
                Dataset features.
            cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`):
                Directory to cache data.
            keep_in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.
            **kwargs (additional keyword arguments):
                Keyword arguments to be passed to [`SqlConfig`].

        Returns:
            [`Dataset`]

        Example:

        ```py
        >>> # Fetch a database table
        >>> ds = Dataset.from_sql("test_data", "postgres:///db_name")
        >>> # Execute a SQL query on the table
        >>> ds = Dataset.from_sql("SELECT sentence FROM test_data", "postgres:///db_name")
        >>> # Use a Selectable object to specify the query
        >>> from sqlalchemy import select, text
        >>> stmt = select([text("sentence")]).select_from(text("test_data"))
        >>> ds = Dataset.from_sql(stmt, "postgres:///db_name")
        ```

        <Tip>

        The returned dataset can only be cached if `con` is specified as URI string.

        </Tip>
        r   )SqlDatasetReader)r   r  r  )io.sqlr  r  )r  r  r   r  r  r<  r  s          rt   from_sqlzDataset.from_sql  s\    ^ 	-,,,,,
 )
 
 
 
 $&&	rv   c                 R    t          | d          r| `t          | d          r| `d S d S )Nr  r  )hasattrr  r  rx   s    rt   r  zDataset.__del__;  s>    4!! 	
4$$ 		 	rv   c                     | S rp   r   rx   s    rt   	__enter__zDataset.__enter__A  s    rv   c                 .    |                                   d S rp   )r  )rs   exc_typeexc_valexc_tbs       rt   __exit__zDataset.__exit__D  s    rv   
deprecateddataset_pathmax_shard_size
num_shardsstorage_optionsc                 
    |t          d          |dk    r!t          j        dt                     |j        V                                 }t          |pt          j                  }t          ||z            dz   t          |pd          ||nd}n|t          j                  }|d         }t          |           }	|	rt          j        j        nt"          j                                         rt          d          |	rt'                                                                        d	d	
           d  j        D             }
t'                                                    |
v r2t/          dt'                                                     d          n|                    d	            fddD             } j        t5           j                  n j        |d<   fdt7                    D             |d<   |d                                         D ]Y}	 t;          j        |d         |                    $# t>          $ r)}t?          t5          |          d| dz             dd}~ww xY wtA           j!                  d}tE          j#        tE          j$                     dtK                     dd| d d          } fdt7                    D             }dgz  }dgz  }|dk    rtM          |          5 }|5  tO          |tP          j)        |          D ]k\  }}}|rN|dz  }|*                    d| d d           tV          ,                    d| d  d!           |\  ||<   ||<   V|-                    |           l	 ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   n|5  |D ]}tQ          j)        d)i |D ]k\  }}}|rN|dz  }|*                    d| d d           tV          ,                    d| d  d!           |\  ||<   ||<   V|-                    |           l	 ddd           n# 1 swxY w Y   |.                     t          j/                  d"d#$          5 }t;          j0        ||d%d	&           ddd           n# 1 swxY w Y   |.                     t          j1                  d"d#$          5 }fd'te                    D             }t;          j0        ||d%(           ddd           dS # 1 swxY w Y   dS )*aN  
        Saves a dataset to a dataset directory, or in a filesystem using either `s3fs.S3FileSystem` or
        any implementation of `fsspec.spec.AbstractFileSystem`.

        For [`Image`] and [`Audio`] data:

        All the Image() and Audio() data are stored in the arrow files.
        If you want to store paths or urls, please use the Value("string") type.

        Args:
            dataset_path (`str`):
                Path (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)
                of the dataset directory where the dataset will be saved to.
            fs (`fsspec.spec.AbstractFileSystem`, *optional*):
                Instance of the remote filesystem where the dataset will be saved to.

                <Deprecated version="2.8.0">

                `fs` was deprecated in version 2.8.0 and will be removed in 3.0.0.
                Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`

                </Deprecated>

            max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
                The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a unit
                (like `"50MB"`).
            num_shards (`int`, *optional*):
                Number of shards to write. By default the number of shards depends on `max_shard_size` and `num_proc`.

                <Added version="2.8.0"/>
            num_proc (`int`, *optional*):
                Number of processes when downloading and generating the dataset locally.
                Multiprocessing is disabled by default.

                <Added version="2.8.0"/>
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.8.0"/>

        Example:

        ```py
        >>> ds.save_to_disk("path/to/dataset/directory")
        >>> ds.save_to_disk("path/to/dataset/directory", max_shard_size="1GB")
        >>> ds.save_to_disk("path/to/dataset/directory", num_shards=1024)
        ```
        NXFailed to push_to_hub: please specify either max_shard_size or num_shards, but not both.r  'fs' was deprecated in favor of 'storage_options' in version 2.8.0 and will be removed in 3.0.0.
You can remove this warning by passing 'storage_options=fs.storage_options' instead.r   r  r   zPplease remove all the indexes using `dataset.drop_index` before saving a datasetT)parentsexist_okc                 f    h | ].}t          |d                                                    j        /S r  )r   resolveparentr   cache_filenames     rt   r   z'Dataset.save_to_disk.<locals>.<setcomp>  sA     ( ( (FT^J/0088::A( ( (rv   zTried to overwrite z& but a dataset can't overwrite itself.)r  c                 ,    i | ]}|j         |         S r   )__dict__)r   r   rs   s     rt   r   z(Dataset.save_to_disk.<locals>.<dictcomp>  s2     	
 	
 	
 s#	
 	
 	
rv   )r9  r4  r6  r5  r7  rr   c                 *    g | ]}d d|ddddiS )r  data-05d-of-.arrowr   )r   	shard_idxr  s     rt   r   z(Dataset.save_to_disk.<locals>.<listcomp>  sE      
  
  
PYZJJJJJJJJJK 
  
  
rv   _data_filesr6  z7
The format kwargs must be JSON serializable, but key 'z' isn't.	 examplesFzSaving the dataset (/z shards)disableunittotalleavedescc              3   x   K   | ]4}|                     |d            d|dddd          dV  5dS )Tr  ru  
contiguousr  r  r  r  )job_idshardfpathr  Nr	  )r   r  r  r  	path_joinrs   r  s     rt   rP  z'Dataset.save_to_disk.<locals>.<genexpr>  s       
 
  $zW[\\"<1b1b1b1bJ1b1b1b1bcc#2	 
 
 
 
 
 
rv   kwargs_iterablezFinished writing shard number  of rg  wutf-8encoding   )indent	sort_keysc                 "    i | ]}||         S r   r   )r   r   dataset_infos     rt   r   z(Dataset.save_to_disk.<locals>.<dictcomp>  s     '_'_'_3\#->'_'_'_rv   )r  r   )3r   r  r  r  r  _estimate_nbytesr_   r    MAX_SHARD_SIZEr   maxfsspecget_fs_token_pathsr2   ospathjoin	posixpathlist_indexesr   r  mkdircache_filesPermissionErrormakedirsrn   r   r   r   r_  rb  ri  r^   rq   rW   tqdmis_progress_bar_enabledr   r   r`   r   _save_to_disk_singleset_descriptionr  debugupdateopenDATASET_STATE_JSON_FILENAMEdumpDATASET_INFO_FILENAMEr8  )rs   r  fsr  r  r  r  dataset_nbytesfs_token_pathsis_localparent_cache_files_pathsstaterO  r  shards_donepbarkwargs_per_jobshard_lengthsshard_sizespoolr  donecontentr<  
state_filedataset_info_filesorted_keys_dataset_infor  r  s   ``  ` `                    @@rt   save_to_diskzDataset.save_to_diskH  s   r %**@j   Mg  
 !0O!2244N5n6]H]^^N^n<==AJZQ77J'388#-#9ZZx
2<Q`aaa(6q(9+B///$,@BGLL).	 	qoppp 	5&&((..td.KKK( (X\Xh( ( ($ L!!))++/GGG%n$|*<*<*D*D*F*Fnnn   H
 KKtK444	
 	
 	
 	
	
 	
 	
 .2Z-C#dj///h 
  
  
  
]bcm]n]n 
  
  
m '(--// 	 	A
5!12156666   FFcXYcccc 
 dj))|7999d))JJJjJJJ
 
 

 
 
 
 
 
 
 
 #:..
 
 
 +fz)a<<h 14 
1 
11Cg:N2 2 2 	1 	1-g   1'1,K 001j1j1jV`1j1j1jkkk"LL)c&)c)cV`)c)c)cdddIPFM&1;v3F3F KK0000	1
1 
1 
1 
1 
1 
1 
1 
1 
1 
1 
1 
1 
1 
1 
11 1 1 1 1 1 1 1 1 1 1 1 1 1 1  	1 	1, 1 1F181M1W1WPV1W1W 1 1-g 1'1,K 001j1j1jV`1j1j1jkkk"LL)c&)c)cV`)c)c)cdddIPFM&1;v3F3F KK000011	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 WWYY|V-OPPRU`gWhh 	ClvIeZTBBBB	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	CWWIlF$@AA3QX  
 
 	M'_'_'_'_&Q]J^J^'_'_'_$I.0A!LLLL	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	Ms   9 I
J$$JJ%O(B	N>2O>O	OO	OOO#BQ33Q7:Q7*SSS3UU
U
r  r	  r
  c              #     K   t           j        }d}t          |j        ||d          }	 t	          j                    }|                    d                              |          D ]i}|                    |           |t          |          z  }t	          j                    |t           j	        z   k    rt	          j                    }| d|fV  d}j	 | d|fV  |
                                \  }	}
|                                 n7# | d|fV  |
                                \  }	}
|                                 w xY w| d|	|
ffV  d S )Nr   T)r   r  r  embed_local_filesarrowF)r    DEFAULT_MAX_BATCH_SIZEr"   r   timer!  iterwrite_tabler   PBAR_REFRESH_TIME_INTERVALfinalizeclose)r  r	  r
  r  r   num_examples_progress_updatewriter_timer  num_examples	num_bytess              rt   r)  zDataset._save_to_disk_single  sn     2
'($^+"	
 
 
	IKKE!--g66;;JGG 5 5""8,,,,H=,9;;)J!JJJ IKKE %)EEEEE3405 %!=====&,oo&7&7#L)LLNNNN %!=====&,oo&7&7#L)LLNNNNd\95555555s   B%D 4D7uri_or_pathc                     t          |           }t                      }t          ||                    |j                            S )a  
        Builds and returns a Path concatenating a local temporary dir with the dir path (or absolute/relative
        path extracted from the uri) passed.

        Args:
            uri_or_path (`str`): Path (e.g. `"dataset/train"`) or remote URI (e.g.
                `"s3://my-bucket/dataset/train"`) to concatenate.

        Returns:
            :class:`Path`: the concatenated path (temp dir + path)
        )r   r8   relative_toanchor)rR  src_dataset_pathtmp_dirs      rt   _build_local_temp_pathzDataset._build_local_temp_path  s@      ,,577G-99:J:QRRSSSrv   c                    |dk    r!t          j        dt                     |j        }t	          j        | |          }|d         }t          |          rt          |           t          j	        n't	          j
        d          }| t          j        j	         t          j                  } t          j                  } t          j                  }|                    |          }|                    |          }	|                    |          }
|	s0|
s.|rt%          d| d| d          t%          d| d| d	          |	s(|rt%          d
| d          t%          d
| d          |
s(|rt%          d
| d          t%          d
| d          t          |          rr}t&                              |          |                    |                                d            t          j                  } t          j                  }t/          |d          5 }t1          j        |          }ddd           n# 1 swxY w Y   t/          |d          5 }t5          j        t1          j        |                    }ddd           n# 1 swxY w Y   t9          fd|d         D                       }||nt;          |          }|rt<          nt>          tA          fd|d         D                       }|d         }|tC          |          n|}t'          ||||d                   }|d         |d         |d         |d         d} |j"        di |}|S )a	  
        Loads a dataset that was previously saved using [`save_to_disk`] from a dataset directory, or from a
        filesystem using either `s3fs.S3FileSystem` or any implementation of
        `fsspec.spec.AbstractFileSystem`.

        Args:
            dataset_path (`str`):
                Path (e.g. `"dataset/train"`) or remote URI (e.g. `"s3//my-bucket/dataset/train"`)
                of the dataset directory where the dataset will be loaded from.
            fs (`fsspec.spec.AbstractFileSystem`, *optional*):
                Instance of the remote filesystem where the dataset will be saved to.

                <Deprecated version="2.8.0">

                `fs` was deprecated in version 2.8.0 and will be removed in 3.0.0.
                Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`

                </Deprecated>

            keep_in_memory (`bool`, defaults to `None`):
                Whether to copy the dataset in-memory. If `None`, the
                dataset will not be copied in-memory unless explicitly enabled by setting
                `datasets.config.IN_MEMORY_MAX_SIZE` to nonzero. See more details in the
                [improve performance](../cache#improve-performance) section.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.8.0"/>

        Returns:
            [`Dataset`] or [`DatasetDict`]:
            - If `dataset_path` is a path of a dataset directory, the dataset requested.
            - If `dataset_path` is a path of a dataset dict directory, a `datasets.DatasetDict` with each split.

        Example:

        ```py
        >>> ds = load_from_disk("path/to/dataset/directory")
        ```
        r  r  r  r   filezNo such files: 'z', nor 'z' found. Expected to load a `Dataset` object, but got a `DatasetDict`. Please use either `datasets.load_from_disk` or `DatasetDict.load_from_disk` instead.zR' found. Expected to load a `Dataset` object but provided path is not a `Dataset`.zNo such file: 'zL'. Expected to load a `Dataset` object but provided path is not a `Dataset`.T)	recursiver  r  Nc              3   D   K   | ]}t          |d                    V  dS r  Nr   )r   	data_filedest_dataset_paths     rt   rP  z)Dataset.load_from_disk.<locals>.<genexpr>  sD       -
 -
?HD"Ij$9::-
 -
 -
 -
 -
 -
rv   r  c              3   b   K   | ])}                      |d                              V  *dS r]  )r  )r   r^  r_  r  	table_clss     rt   rP  z)Dataset.load_from_disk.<locals>.<genexpr>  sW       $
 $
 		*;Yz=R S STT$
 $
 $
 $
 $
 $
rv   rr   r9  )r{  rm   rn   rC  r5  r6  r4  r7  r0  r   )#r  r  r  r  r  r  r2   r1   r!  r   
filesystemr  r  r    DATASETDICT_JSON_FILENAMEr.  r0  isfileFileNotFoundErrorr   rX  downloadas_posixr-  r_  loadrD   r  rZ   r\   rL   rM   rP   rI   r!  )r  r1  r  r  r3  dataset_dict_json_pathdataset_state_json_pathdataset_info_pathdataset_dict_is_filedataset_info_is_filedataset_state_is_filerV  r?  r6  r@  r  r   r{  rn   r   r   r_  r  ra  s                        @@@rt   load_from_diskzDataset.load_from_disk$  sO   ^ Mg  
 !0O2<Q`aaa(6q(9## 	% 5l C C!II"6**B ,I!*+<f>^!_!_"+),=v?a"b"b%I&79UVV!yy)?@@!yy):;; "		*A B B# 	,A 	# ' w'8  w  wBY  w  w  w   $ j#4  j  j>U  j  j  j   $ 	# ' U&7  U  U  U   $ B"3  B  B  B   % 	# ' [&=  [  [  [   $ H"9  H  H  H  
  ## 	[0 ' > >?O P PKK(*;*D*D*F*FRVKWWW&/i0A6Ce&f&f# )	*;V=Y Z Z)G<<< 	*
Ij))E	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	*#g666 	O:K&0;L1M1MNNL	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O - -
 -
 -
 -
LQR_L`-
 -
 -
 
 
 ,:+EK[\hKiKi%3JMM9J	# $
 $
 $
 $
 $
 $
"=1$
 $
 $
 
 

 h % 1eu#n-	
 
 
 .)"#34./"'(=">	
 
 &'%////s$   4III1'J$$J(+J(c                     | j         S )a	  The Apache Arrow table backing the dataset.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.data
        MemoryMappedTable
        text: string
        label: int64
        ----
        text: [["compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .","the soundtrack alone is worth the price of admission .","rodriguez does a splendid job of racial profiling hollywood style--casting excellent latin actors of all ages--a trend long overdue .","beneath the film's obvious determination to shock at any cost lies considerable skill and determination , backed by sheer nerve .","bielinsky is a filmmaker of impressive talent .","so beautifully acted and directed , it's clear that washington most certainly has a new career ahead of him if he so chooses .","a visual spectacle full of stunning images and effects .","a gentle and engrossing character study .","it's enough to watch huppert scheming , with her small , intelligent eyes as steady as any noir villain , and to enjoy the perfectly pitched web of tension that chabrol spins .","an engrossing portrait of uncompromising artists trying to create something original against the backdrop of a corporate music industry that only seems to care about the bottom line .",...,"ultimately , jane learns her place as a girl , softens up and loses some of the intensity that made her an interesting character to begin with .","ah-nuld's action hero days might be over .","it's clear why deuces wild , which was shot two years ago , has been gathering dust on mgm's shelf .","feels like nothing quite so much as a middle-aged moviemaker's attempt to surround himself with beautiful , half-naked women .","when the precise nature of matthew's predicament finally comes into sharp focus , the revelation fails to justify the build-up .","this picture is murder by numbers , and as easy to be bored by as your abc's , despite a few whopping shootouts .","hilarious musical comedy though stymied by accents thick as mud .","if you are into splatter movies , then you will probably have a reasonably good time with the salton sea .","a dull , simple-minded and stereotypical tale of drugs , death and mind-numbing indifference on the inner-city streets .","the feature-length stretch . . . strains the show's concept ."]]
        label: [[1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0]]
        ```
        r  rx   s    rt   r#  zDataset.data  s    $ zrv   c                 ~    t          | j                  }| j        |t          | j                  z  }d |D             S )a  The cache files containing the Apache Arrow table backing the dataset.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.cache_files
        [{'filename': '/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46/rotten_tomatoes_movie_review-validation.arrow'}]
        ```
        Nc                     g | ]}d |iS r  r   r  s     rt   r   z'Dataset.cache_files.<locals>.<listcomp>  s    OOO^,OOOrv   )rR   r  r  )rs   r$  s     rt   r$  zDataset.cache_files  sD     -TZ88=$1$-@@@KOO;OOOOrv   c                     | j         j        S )zNumber of columns in the dataset.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.num_columns
        2
        ```
        )r  num_columnsrx   s    rt   ru  zDataset.num_columns  s     z%%rv   c                 @    | j         | j         j        S | j        j        S )a
  Number of rows in the dataset (same as [`Dataset.__len__`]).

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.num_rows
        1066
        ```
        )r  num_rowsr  rx   s    rt   rw  zDataset.num_rows  s"     =$=))z""rv   c                     | j         j        S )a  Names of the columns in the dataset.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.column_names
        ['text', 'label']
        ```
        r  r3  rx   s    rt   r3  zDataset.column_names  s     z&&rv   c                 X    | j         | j         j        | j        j        fS | j        j        S )a	  Shape of the dataset (number of columns, number of rows).

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.shape
        (1066, 2)
        ```
        )r  rw  r  ru  r   rx   s    rt   r   zDataset.shape  s,     =$M*DJ,BCCzrv   r   c                 J   || j         j        vr t          d| d| j         j         d          | j        /| j        j        | j         j        k    r|                                 }n| }|j                             |                                                                          S )a5  Return a list of the unique elements in a column.

        This is implemented in the low-level backend and as such, very fast.

        Args:
            column (`str`):
                Column name (list all the column names with [`~datasets.Dataset.column_names`]).

        Returns:
            `list`: List of unique elements in the given column.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.unique('label')
        [1, 0]
        ```
        Column () not in table columns ().)	r  r3  r   r  rw  flatten_indicesr   unique	to_pylist)rs   r   r   s      rt   r  zDataset.unique  s    * 000ccc
H_cccddd=$)?4:CV)V)V**,,GGG}##F++2244>>@@@rv   include_nullsc           
        	 | j         j        vr t          d d| j         j         d          | j        j                 }t          |t                    s5t          dt          j         d dt          |          j         d          |j	        dk    sr6d	| 
                              v rfd
}|                     |dd          }n| }t          fd|
                              D                       }t          |          		fd}|j                                        }	|<   |                    |d|d          }|S )a4  Casts the given column as [`~datasets.features.ClassLabel`] and updates the table.

        Args:
            column (`str`):
                The name of the column to cast (list all the column names with [`~datasets.Dataset.column_names`])
            include_nulls (`bool`, defaults to `False`):
                Whether to include null values in the class labels. If `True`, the null values will be encoded as the `"None"` class label.

                <Added version="1.14.2"/>

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("boolq", split="validation")
        >>> ds.features
        {'answer': Value(dtype='bool', id=None),
         'passage': Value(dtype='string', id=None),
         'question': Value(dtype='string', id=None)}
        >>> ds = ds.class_encode_column('answer')
        >>> ds.features
        {'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None),
         'passage': Value(dtype='string', id=None),
         'question': Value(dtype='string', id=None)}
        ```
        r|  r}  r~  z%Class encoding is only supported for  column, and column  is rg  r   Nc                 6    fd|          D             | <   | S )Nc                 :    g | ]}s|t          |          nd S rp   r   r   r   r  s     rt   r   zIDataset.class_encode_column.<locals>.stringify_column.<locals>.<listcomp>]  s:     ! ! !U[=PF4FCKKKD! ! !rv   r   )r   r   r  s    rt   stringify_columnz5Dataset.class_encode_column.<locals>.stringify_column\  s<    ! ! ! !_dek_l! ! !f rv   TzStringifying the column)batchedr  c              3   >   K   | ]}s|t          |          V  d S rp   r  r  s     rt   rP  z.Dataset.class_encode_column.<locals>.<genexpr>k  s4      rrVmr_e_qS[[_q_q_q_qrrrv   namesc                 8    fd|          D             | <   | S )Nc                 `    g | ]*}s|"                     t          |                    nd +S rp   )str2intr   )r   r   dst_featr  s     rt   r   zMDataset.class_encode_column.<locals>.cast_to_class_labels.<locals>.<listcomp>o  sM        2?^&BT  V---Z^  rv   r   )r   r   r  r  s    rt   cast_to_class_labelsz9Dataset.class_encode_column.<locals>.cast_to_class_labelsn  s?        #Fm  E&M Lrv   zCasting to class labels)r  r   r  )r  r3  r   rq   r   r   r*   r   r   r   r  r"  r8  r'   r   )
rs   r   r  src_featr  dsetclass_namesr  new_featuresr  s
    ``      @rt   class_encode_columnzDataset.class_encode_column6  s   8 000ccc
H_cccddd:&v.(E** 	 C  C  C\b  C  Chlmuhvhvh  C  C  C   >X%%-%DDKKPVDWDW<W<W      88 .   DD D rrrrt{{67J7JrrrrrK000	 	 	 	 	 	 	 }))++'Vxx !*	  
 
 rv   )inplace   new_fingerprintc                    t          j        |           }t          d|          D ]D}t          d |j        j        D                       r|j                                        |_        D | j        j                            |          |j	        _        t          |j        |j                  |_        t          	                    d| d|dz   |k     rdnd d           ||_        |S )a  Flatten the table.
        Each column with a struct type is flattened into one column per struct field.
        Other columns are left unchanged.

        Args:
            new_fingerprint (`str`, *optional*):
                The new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.

        Returns:
            [`Dataset`]: A copy of the dataset with flattened columns.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("squad", split="train")
        >>> ds.features
        {'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
         'context': Value(dtype='string', id=None),
         'id': Value(dtype='string', id=None),
         'question': Value(dtype='string', id=None),
         'title': Value(dtype='string', id=None)}
        >>> ds.flatten()
        Dataset({
            features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
            num_rows: 87599
        })
        ```
        r   c              3   T   K   | ]#}t          |j        t          j                  V  $d S rp   )r   r   rh  
StructType)r   r  s     rt   rP  z"Dataset.flatten.<locals>.<genexpr>  s0      [[U:ej"-88[[[[[[rv   )	max_depthzFlattened dataset from depth z
 to depth unknownrg  )r   r   r   anyr  r\  flattenrq   r   rm   re  r  r9  )rs   r  r  r   depths        rt   r  zDataset.flatten  s    @ -%%1i(( 	 	E[[gmFZ[[[[[  ' 5 5 7 7 $
 3 ; ;i ; P P5gmWEUVVqEqq%RS)V_J_J_QQenqqqrrr.rv     r   cache_file_namewriter_batch_sizec                 x   t          |          t          | j        j                  k    r,t          dt	          |           d| j        j                   |j        }| j        }	|                     d          }
|
                    t          t          |          d|||||||d
  
        }
 |
j        di |	}
|
S )	ar
  
        Cast the dataset to a new set of features.

        Args:
            features ([`Features`]):
                New features to cast the dataset to.
                The name of the fields in the features must match the current column names.
                The type of the data must also be convertible from one type to the other.
                For non-trivial conversion, e.g. `str` <-> `ClassLabel` you should use [`~datasets.Dataset.map`] to update the Dataset.
            batch_size (`int`, defaults to `1000`):
                Number of examples per batch provided to cast.
                If `batch_size <= 0` or `batch_size == None` then provide the full dataset as a single batch to cast.
            keep_in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.
            load_from_cache_file (`bool`, defaults to `True` if caching is enabled):
                If a cache file storing the current computation from `function`
                can be identified, use it instead of recomputing.
            cache_file_name (`str`, *optional*, defaults to `None`):
                Provide the name of a path for the cache file. It is used to store the
                results of the computation instead of the automatically generated cache file name.
            writer_batch_size (`int`, defaults to `1000`):
                Number of rows per write operation for the cache file writer.
                This value is a good trade-off between memory usage during the processing, and processing speed.
                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running [`~datasets.Dataset.map`].
            num_proc (`int`, *optional*, defaults to `None`):
                Number of processes for multiprocessing. By default it doesn't
                use multiprocessing.

        Returns:
            [`Dataset`]: A copy of the dataset with casted features.

        Example:

        ```py
        >>> from datasets import load_dataset, ClassLabel, Value
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.features
        {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
         'text': Value(dtype='string', id=None)}
        >>> new_features = ds.features.copy()
        >>> new_features['label'] = ClassLabel(names=['bad', 'good'])
        >>> new_features['text'] = Value('large_string')
        >>> ds = ds.cast(new_features)
        >>> ds.features
        {'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None),
         'text': Value(dtype='large_string', id=None)}
        ```
        zThe columns in features (z3) must be identical as the columns in the dataset: rE  r\  TzCasting the dataset)	r  r   r  r  r  r  r  r   r  r   )r8  r  r3  r   r   r  r   r!  r"  r   rS   )rs   r   r   r  r  r  r  r  r\  r   r   s              rt   r  zDataset.cast  s    t (vdj&=>>>>LDNN L L26*2IL L  
 &""7++++Jv...!)!5+/&  
 
 &'%////rv   featurec                 L   t          |d          rtt          j        |           }||j        j        |<   ||_        |j                            |j        j                  |_        t          |j        |j                  |_        |S | j        }|||<   |                     |          S )a  Cast column to feature for decoding.

        Args:
            column (`str`):
                Column name.
            feature (`FeatureType`):
                Target feature.
            new_fingerprint (`str`, *optional*):
                The new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.

        Returns:
            [`Dataset`]

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.features
        {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
         'text': Value(dtype='string', id=None)}
        >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
        >>> ds.features
        {'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None),
         'text': Value(dtype='string', id=None)}
        ```
        decode_example)
r  r   r   rq   r   r9  r  r  r  re  )rs   r   r  r  r   r   s         rt   cast_columnzDataset.cast_column   s    < 7,-- 
	'mD))G-4GM"6*#2G #M..w/?/LMMGM9'-IYZZGMN}H&HV99X&&&rv   r3  c                 p   t          j        |           }t          |t                    r|g}|D ]/}||j        j        vrt          d| d|j        j                   0|D ]}|j        j        |= |j        	                    |          |_        t          |j        |j                  |_        ||_        |S )a  
        Remove one or several column(s) in the dataset and the features associated to them.

        You can also remove a column using [`~datasets.Dataset.map`] with `remove_columns` but the present method
        is in-place (doesn't copy the data to a new dataset) and is thus faster.

        Args:
            column_names (`Union[str, List[str]]`):
                Name of the column(s) to remove.
            new_fingerprint (`str`, *optional*):
                The new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.

        Returns:
            [`Dataset`]: A copy of the dataset object without the columns to remove.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.remove_columns('label')
        Dataset({
            features: ['text'],
            num_rows: 1066
        })
        >>> ds.remove_columns(column_names=ds.column_names) # Removing all the columns returns an empty dataset with the `num_rows` property set to 0
        Dataset({
            features: [],
            num_rows: 0
        })
        ```
        Column name 5 not in the dataset. Current columns in the dataset: )r   r   r   r   r  r3  r   rq   r   dropre  r9  )rs   r3  r  r   column_names        rt   remove_columnszDataset.remove_columns*  s    J -%%lC(( 	*(>L' 	 	K'-"<<< T; T T7>}7QT T   = ( 	4 	4K&{33**<885gmWEUVV.rv   original_column_namenew_column_namec                    t          j        |           }|j        j        vrt	          d d|j        j                   |j        j        v rt	          d d|j        j                   st	          d          fd} || j        j                  }| j         || j                  |_        t          fd| j        j        	                                D                       |j        _        |j        
                    |          |_        t          |j        |j                  |_        ||_        |S )	a  
        Rename a column in the dataset, and move the features associated to the original column under the new column
        name.

        Args:
            original_column_name (`str`):
                Name of the column to rename.
            new_column_name (`str`):
                New name for the column.
            new_fingerprint (`str`, *optional*):
                The new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.

        Returns:
            [`Dataset`]: A copy of the dataset with a renamed column.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.rename_column('label', 'label_new')
        Dataset({
            features: ['text', 'label_new'],
            num_rows: 1066
        })
        ```
        zOriginal column name r  zNew column name zz already in the dataset. Please choose a column name which is not already in the dataset. Current columns in the dataset: zNew column name is empty.c                 "    fd| D             S )Nc                 $    g | ]}|k    rn|S r   r   )r   r  r  r  s     rt   r   z9Dataset.rename_column.<locals>.rename.<locals>.<listcomp>  s)    ___PSs.B'B'BOO___rv   r   )r   r  r  s    rt   renamez%Dataset.rename_column.<locals>.rename  s!    _____W^____rv   Nc                 ,    i | ]\  }}|k    rn||S r   r   )r   r  r  r  r  s      rt   r   z)Dataset.rename_column.<locals>.<dictcomp>  s>        C $'*>#>#>C  rv   )r   r   r  r3  r   r4  r(   rq   r   r   rename_columnsre  r9  )rs   r  r  r  r   r  new_column_namess    ``    rt   rename_columnzDataset.rename_columnb  s   B -%%w}'AAAP(< P P3:=3MP P   gm888P? P P3:=3MP P  
  	:8999	` 	` 	` 	` 	` 	` "6$*"9::+&,fT-A&B&BG#!)    $(J$7$=$=$?$?  "
 "
  445EFF5gmWEUVV.rv   rR  c                    t          j        |           }t                                                    t          |j                  z
  }|rt          d| d|j        j                   t                                                    t          t                                                              z
  }|dk    rt          d| d          d                                 D             }|rt          d| d          fd	} || j        j                  }| j	         || j	                  |_	        t          fd| j        j        pi                                 D                       |j        _        |j                            |          |_        t          |j        |j                  |_        ||_        |S )a  
        Rename several columns in the dataset, and move the features associated to the original columns under
        the new column names.

        Args:
            column_mapping (`Dict[str, str]`):
                A mapping of columns to rename to their new names
            new_fingerprint (`str`, *optional*):
                The new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.

        Returns:
            [`Dataset`]: A copy of the dataset with renamed columns

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.rename_columns({'text': 'text_new', 'label': 'label_new'})
        Dataset({
            features: ['text_new', 'label_new'],
            num_rows: 1066
        })
        ```
        zOriginal column names r  r   zDNew column names must all be different, but this column mapping has z duplicatesc                     g | ]}||S r   r   )r   new_cols     rt   r   z*Dataset.rename_columns.<locals>.<listcomp>  s    [[[SZ[W[[[rv   zNew column names z are empty.c                      fd| D             S )Nc                 ,    g | ]}|v r|         n|S r   r   )r   r  rR  s     rt   r   z:Dataset.rename_columns.<locals>.rename.<locals>.<listcomp>  s-    ]]]c3.+@+@N3''c]]]rv   r   )r   rR  s    rt   r  z&Dataset.rename_columns.<locals>.rename  s    ]]]]U\]]]]rv   Nc                 4    i | ]\  }}|v r|         n||S r   r   )r   r  r  rR  s      rt   r   z*Dataset.rename_columns.<locals>.<dictcomp>  sC        C (+n'<'<s###w  rv   )r   r   r   r   r3  r   r  r   r  r4  r(   rq   r   r   r  re  r9  )	rs   rR  r  r   extra_columns#number_of_duplicates_in_new_columnsempty_new_columnsr  r  s	    `       rt   r  zDataset.rename_columns  s   : -%%N//1122S9M5N5NN 	P P P3:=3MP P  
 /2.2G2G2I2I.J.JSQTUcUjUjUlUlQmQmMnMn.n+.!33H:H H H  
 \[N4I4I4K4K[[[ 	QO1BOOOPPP	^ 	^ 	^ 	^ 	^ "6$*"9::+&,fT-A&B&BG#!)   %)Z%8%>B$E$E$G$G  "
 "
  445EFF5gmWEUVV.rv   c                    t          t                    rgD ]0}|| j        j        vr t	          d| d| j        j         d          1t          j        |           }t          fd|j        j	        
                                D                       |j        _	        |j                                      |_        t          |j        |j	                  |_        ||_        |S )az  Select one or several column(s) in the dataset and the features
        associated to them.

        Args:
            column_names (`Union[str, List[str]]`):
                Name of the column(s) to keep.
            new_fingerprint (`str`, *optional*):
                The new fingerprint of the dataset after transform. If `None`,
                the new fingerprint is computed using a hash of the previous
                fingerprint, and the transform arguments.

        Returns:
            [`Dataset`]: A copy of the dataset object which only consists of
            selected columns.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.select_columns(['text'])
        Dataset({
            features: ['text'],
            num_rows: 1066
        })
        ```
        r  r  rg  c                 $    i | ]\  }}|v 	||S r   r   )r   rO  vr3  s      rt   r   z*Dataset.select_columns.<locals>.<dictcomp>	  s+    *p*p*pDAq^_co^o^o1a^o^o^orv   )r   r   r  r3  r   r   r   r(   rq   r   r   selectre  r9  )rs   r3  r  r  r   s    `   rt   select_columnszDataset.select_columns  s    > lC(( 	*(>L' 	 	K$*"999 2; 2 2z.2 2 2   : -%%!)*p*p*p*pGM<R<X<X<Z<Z*p*p*p!q!q,,\::5gmWEUVV.rv   c                     | j         S )aa  Number of rows in the dataset.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.__len__
        <bound method Dataset.__len__ of Dataset({
            features: ['text', 'label'],
            num_rows: 1066
        })>
        ```
        rw  rx   s    rt   __len__zDataset.__len__	  s     }rv   c           	   #     K   | j         | j        | j        ni }t          | j        fd| j        j        i|}t          j        }t          | j	        |          D ]Q}t          |j                  D ]:}|                    |d          }t          |d|| j        | j                  }|V  ;RdS t          | j                  D ]}|                     |          V  dS )zIterate through the examples.

        If a formatting is set with :meth:`Dataset.set_format` rows will be returned with the
        selected format.
        Nr   )r   r   r   	formatterformat_columnsr2  )r  r6  r@   r5  rq   r   r    'ARROW_READER_BATCH_SIZE_IN_DATASET_ITERrT   r#  r   rw  slicer>   r4  r7  _getitem)rs   r1  r  r   pa_subtabler   pa_subtable_exformatted_outputs           rt   __iter__zDataset.__iter__+	  s7      =  483F3RD//XZM%d&7gg$*BUgYfggIGJ)$)
KKK 
+ 
+{344 	+ 	+A%0%6%6q!%<%<N'3&"+'+';+/+C( ( ($ +****	+
+ 
+ 4=))  mm      rv   drop_last_batchc              #     K   | j         ~| j        | j        ni }t          | j        fd| j        j        i|}t          | j        ||          D ]6}t          |t          |j
                  || j        | j                  }|V  7dS |s| j
        n| j
        |z  |z  }t          d||          D ]*}|                     t          |||z                       V  +dS )a  Iterate through the batches of size `batch_size`.

        If a formatting is set with [`~datasets.Dataset.set_format`] rows will be returned with the
        selected format.

        Args:
            batch_size (:obj:`int`): size of each batch to yield.
            drop_last_batch (:obj:`bool`, default `False`): Whether a last batch smaller than the batch_size should be
                dropped
        Nr   )r   r  r  r   )r  r6  r@   r5  rq   r   rT   r#  r>   r   rw  r4  r7  r  r  )	rs   r   r  r1  r  r  formatted_batchrw  r   s	            rt   rH  zDataset.iterH	  s6      =  483F3RD//XZM%d&7gg$*BUgYfggI)$)
\klll & &".+.//'#'#7'+'?# # # &%%%%& & -<it}}R\A\_iAiH1h
33  mm!Q^,,      rv   c                 p    dt          | j        j                                                   d| j         dS )NzDataset({
    features: z,
    num_rows: z
}))r   rq   r   r   rw  rx   s    rt   __repr__zDataset.__repr__h	  s8    sD1D1I1I1K1K,L,Lss_c_lssssrv   c                 T    | j         | j        | j        | j        n| j        | j        dS )Nr0  )r5  r6  r4  r3  r7  rx   s    rt   r   zDataset.formatk	  s;     %!0,0,@,Ht((dNb"&":	
 
 	
rv   r   r2  c              +      K   | j         }| j        }| j        }| j        }	  | j        |||fi | dV   | j        |||fi | dS #  | j        |||fi | w xY w)aI  To be used in a `with` statement. Set `__getitem__` return format (type and columns).

        Args:
            type (`str`, *optional*):
                Output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'pandas', 'arrow', 'jax']`.
                `None` means `__getitem__`` returns python objects (default).
            columns (`List[str]`, *optional*):
                Columns to format in the output.
                `None` means `__getitem__` returns all columns (default).
            output_all_columns (`bool`, defaults to `False`):
                Keep un-formatted columns as well in the output (as python objects).
            **format_kwargs (additional keyword arguments):
                Keywords arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.
        N)r5  r6  r4  r7  r:  )	rs   r   r   r2  r1  old_format_typeold_format_kwargsold_format_columnsold_output_all_columnss	            rt   formatted_aszDataset.formatted_ast	  s      , + /!1!%!9	nDOD'+=OOOOOEEEDOO-?AWmm[lmmmmmODOO-?AWmm[lmmmms   A Ac           	          |                     |                    di                      t          |          }t          |fd j        j        i| t          |t                    r|g}t          |t                    rt          |          }|Xt           fd|D                       r=t          dt          t           fd|                     d j        j                   ||                                }| _        | _        | _        | _        t(                              d|d	n||d
nt          |          |rdnd           dS )a  Set `__getitem__` return format (type and columns). The data formatting is applied on-the-fly.
        The format `type` (for example "numpy") is used to format batches when using `__getitem__`.
        It's also possible to use custom transforms for formatting using [`~datasets.Dataset.set_transform`].

        Args:
            type (`str`, *optional*):
                Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'pandas', 'arrow', 'jax']`.
                `None` means `__getitem__` returns python objects (default).
            columns (`List[str]`, *optional*):
                Columns to format in the output.
                `None` means `__getitem__` returns all columns (default).
            output_all_columns (`bool`, defaults to `False`):
                Keep un-formatted columns as well in the output (as python objects).
            **format_kwargs (additional keyword arguments):
                Keywords arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.

        It is possible to call [`~datasets.Dataset.map`] after calling `set_format`. Since `map` may add new columns, then the list of formatted columns
        gets updated. In this case, if you apply `map` on a dataset to add a new column, then this column will be formatted as:

            ```
            new formatted columns = (all columns - previously unformatted columns)
            ```

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> from transformers import AutoTokenizer
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True)
        >>> ds.set_format(type='numpy', columns=['text', 'label'])
        >>> ds.format
        {'type': 'numpy',
        'format_kwargs': {},
        'columns': ['text', 'label'],
        'output_all_columns': False}
        ```
        r1  r   Nc              3   4   K   | ]}|j         j        vV  d S rp   ry  r   r  rs   s     rt   rP  z%Dataset.set_format.<locals>.<genexpr>	  s-      &]&]cs$*2I'I&]&]&]&]&]&]rv   zColumns c                      | j         j        vS rp   ry  r  rs   s    rt   <lambda>z$Dataset.set_format.<locals>.<lambda>	  s    3dj>U3U rv   r  z}Set __getitem__(key) output type to %s for %s columns  (when key is int or slice) and %s output other (un-formatted) columns.zpython objectsnodozdon't)r,  r   r?   r@   rq   r   r   r   tupler   r  r   filterr  r3  r   r5  r6  r4  r7  r  r+  )rs   r   r   r2  r1  s   `    rt   r:  zDataset.set_format	  s   ^ 	]..CCDDD *$//dJJTZ%8JMJJJ gs## 	 iGgu%% 	$7mmG3&]&]&]&]U\&]&]&]#]#] q4'U'U'U'UW^ _ _``  q  q  X\  Xb  Xo  q  q   llnnG +&#5 V $$ODDW&3DDG	
 	
 	
 	
 	
rv   c                 .    |                                   dS )a
  Reset `__getitem__` return format to python objects and all columns.

        Same as `self.set_format()`

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> from transformers import AutoTokenizer
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True)
        >>> ds.set_format(type='numpy', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
        >>> ds.format
        {'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
         'format_kwargs': {},
         'output_all_columns': False,
         'type': 'numpy'}
        >>> ds.reset_format()
        >>> ds.format
        {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
         'format_kwargs': {},
         'output_all_columns': False,
         'type': None}
        ```
        Nr:  rx   s    rt   reset_formatzDataset.reset_format	  s    6 	rv   	transformc                 8    |                      d|||           dS )a
  Set `__getitem__` return format using this transform. The transform is applied on-the-fly on batches when `__getitem__` is called.
        As [`~datasets.Dataset.set_format`], this can be reset using [`~datasets.Dataset.reset_format`].

        Args:
            transform (`Callable`, *optional*):
                User-defined formatting transform, replaces the format defined by [`~datasets.Dataset.set_format`].
                A formatting function is a callable that takes a batch (as a `dict`) as input and returns a batch.
                This function is applied right before returning the objects in `__getitem__`.
            columns (`List[str]`, *optional*):
                Columns to format in the output.
                If specified, then the input batch of the transform only contains those columns.
            output_all_columns (`bool`, defaults to `False`):
                Keep un-formatted columns as well in the output (as python objects).
                If set to True, then the other un-formatted columns are kept with the output of the transform.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> from transformers import AutoTokenizer
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        >>> def encode(batch):
        ...     return tokenizer(batch['text'], padding=True, truncation=True, return_tensors='pt')
        >>> ds.set_transform(encode)
        >>> ds[0]
        {'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]),
         'input_ids': tensor([  101, 29353,  2135, 15102,  1996,  9428, 20868,  2890,  8663,  6895,
                 20470,  2571,  3663,  2090,  4603,  3017,  3008,  1998,  2037, 24211,
                 5637,  1998, 11690,  2336,  1012,   102]),
         'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                 0, 0])}
        ```
        r  )r   r2  r  Nr  )rs   r  r   r2  s       rt   set_transformzDataset.set_transform	  s'    R 	'FXdmnnnnnrv   c                 P    t          j        |           } |j        d|||d| |S )a  Set `__getitem__` return format (type and columns). The data formatting is applied on-the-fly.
        The format `type` (for example "numpy") is used to format batches when using `__getitem__`.

        It's also possible to use custom transforms for formatting using [`~datasets.Dataset.with_transform`].

        Contrary to [`~datasets.Dataset.set_format`], `with_format` returns a new [`Dataset`] object.

        Args:
            type (`str`, *optional*):
                Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'pandas', 'arrow', 'jax']`.
                `None` means `__getitem__` returns python objects (default).
            columns (`List[str]`, *optional*):
                Columns to format in the output.
                `None` means `__getitem__` returns all columns (default).
            output_all_columns (`bool`, defaults to `False`):
                Keep un-formatted columns as well in the output (as python objects).
            **format_kwargs (additional keyword arguments):
                Keywords arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> from transformers import AutoTokenizer
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True)
        >>> ds.format
        {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
         'format_kwargs': {},
         'output_all_columns': False,
         'type': None}
        >>> ds = ds.with_format(type='tensorflow', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
        >>> ds.format
        {'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
         'format_kwargs': {},
         'output_all_columns': False,
         'type': 'tensorflow'}
        ```
        )r   r   r2  r   )r   r   r:  )rs   r   r   r2  r1  r   s         rt   r!  zDataset.with_format)
  s=    ^ -%%ngJ\nn`mnnnrv   c                 ^    t          j        |           }|                    |||           |S )a  Set `__getitem__` return format using this transform. The transform is applied on-the-fly on batches when `__getitem__` is called.

        As [`~datasets.Dataset.set_format`], this can be reset using [`~datasets.Dataset.reset_format`].

        Contrary to [`~datasets.Dataset.set_transform`], `with_transform` returns a new [`Dataset`] object.

        Args:
            transform (`Callable`, `optional`):
                User-defined formatting transform, replaces the format defined by [`~datasets.Dataset.set_format`].
                A formatting function is a callable that takes a batch (as a `dict`) as input and returns a batch.
                This function is applied right before returning the objects in `__getitem__`.
            columns (`List[str]`, `optional`):
                Columns to format in the output.
                If specified, then the input batch of the transform only contains those columns.
            output_all_columns (`bool`, defaults to `False`):
                Keep un-formatted columns as well in the output (as python objects).
                If set to `True`, then the other un-formatted columns are kept with the output of the transform.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> from transformers import AutoTokenizer
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        >>> def encode(example):
        ...     return tokenizer(example["text"], padding=True, truncation=True, return_tensors='pt')
        >>> ds = ds.with_transform(encode)
        >>> ds[0]
        {'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]),
         'input_ids': tensor([  101, 18027, 16310, 16001,  1103,  9321,   178, 11604,  7235,  6617,
                 1742,  2165,  2820,  1206,  6588, 22572, 12937,  1811,  2153,  1105,
                 1147, 12890, 19587,  6463,  1105, 15026,  1482,   119,   102]),
         'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                 0, 0, 0, 0, 0])}
        ```
        )r  r   r2  )r   r   r  )rs   r  r   r2  r   s        rt   with_transformzDataset.with_transform\
  s5    X -%%	7Wijjjrv   r   taskidc           
      V  	 t          t                    rd | j        j        pg D             }fd| j        j        pg D             }|s/t	          d dt          t          |                               d|cxk    rt          |          k     sFn d                    d t          |          D                       }t	          d| d	 d
|           ||         }n;t          t                    r}n#t	          d dt                     d          |                    | j        j                  }|j        		fd| j        D             }|                     |          }|                    	          }d|j        _        |                    |j                  }|S )a  
        Prepare a dataset for the given task by casting the dataset's [`Features`] to standardized column names and types as detailed in [`datasets.tasks`](./task_templates).

        Casts [`datasets.DatasetInfo.features`] according to a task-specific schema. Intended for single-use only, so all task templates are removed from [`datasets.DatasetInfo.task_templates`] after casting.

        Args:
            task (`Union[str, TaskTemplate]`):
                The task to prepare the dataset for during training and evaluation. If `str`, supported tasks include:

                - `"text-classification"`
                - `"question-answering"`

                If [`TaskTemplate`], must be one of the task templates in [`datasets.tasks`](./task_templates).
            id (`int`, defaults to `0`):
                The id required to unambiguously identify the task template when multiple task templates of the same type are supported.
        c                     g | ]	}|j         
S r   r  )r   rS  s     rt   r   z,Dataset.prepare_for_task.<locals>.<listcomp>
  s    TTTxX]TTTrv   c                 *    g | ]}|j         k    |S r   r  )r   rS  r  s     rt   r   z,Dataset.prepare_for_task.<locals>.<listcomp>
  s(    #w#w#waianrvavavHavavavrv   zTask z7 is not compatible with this dataset! Available tasks: r   r  c              3   ,   K   | ]\  }}d | d| V  dS )z- `z` for task Nr   )r   idxrS  s      rt   rP  z+Dataset.prepare_for_task.<locals>.<genexpr>
  sH       / /9Fh4#44(44/ / / / / /rv   zId z
 for task z) is not in a valid range. Supported ids:
z@Expected a `str` or `datasets.TaskTemplate` object but got task z with type rg  c                     g | ]}|v|	S r   r   )r   r   rR  s     rt   r   z,Dataset.prepare_for_task.<locals>.<listcomp>
  s$    bbbfVSaEaEa6EaEaEarv   NrZ  )r   r   rm   r   r   r   ra   r   r   	enumeraterV   r   align_with_featuresr   rR  r3  r  r  r  )
rs   r  r  taskscompatible_templatestemplates_list_strrS  columns_to_dropr   rR  s
    `       @rt   prepare_for_taskzDataset.prepare_for_task
  s   $ dC   	TTDI4L4RPRTTTE#w#w#w#wdi>V>\Z\#w#w#w '  uDuuY]^klq^r^rYsYsuu   6666S!5666666%)YY / /JSThJiJi/ / / & &" !l"llllXjll   ,B/HHl++ 	HHqSWqqdhimdndnqqq   //	0BCC!0bbbb0Abbb%%o66((88&*#,,(9,::rv   r   c                 N   d|v r|d         n| j         }d|v r|d         n| j        }d|v r|d         n| j        }d|v r|d         n| j        }||ni }t	          |fd| j        j        i|}t          | j        || j	        | j	        nd          }t          |||||          }	|	S )	z
        Can be used to index columns (by string names) or rows (by integer index, slices, or iter of indices or bools)
        format_typer  r2  r1  Nr   )r   r  )r5  r4  r7  r6  r@   rq   r   rA   r  r  r>   )
rs   r   r<  r	  r  r2  r1  r  r  r  s
             rt   r  zDataset._getitem
  s     0=/F/Ff]++DL]5E5O5O 011UYUi,@F,J,JF'((PTPh 	 4Cf3L3L//RVRe)6)B!+]]
8K]}]]	!$*cDMLe4==koppp'	.ew
 
 
  rv   c                     d S rp   r   rs   r   s     rt   __getitem__zDataset.__getitem__
      rv   c                     d S rp   r   r  s     rt   r  zDataset.__getitem__
  r  rv   c                 ,    |                      |          S )zjCan be used to index columns (by string names) or rows (by integer index or iterable of indices or bools).)r  r  s     rt   r  zDataset.__getitem__
  s    }}S!!!rv   r   c                     |                      |          t          t          t                                                 }fdt	          |          D             S )z<Can be used to get a batch using a list of integers indices.c                 R    g | ]"fd                                  D             #S )c                 (    i | ]\  }}||         S r   r   r   r  r   r   s      rt   r   z3Dataset.__getitems__.<locals>.<listcomp>.<dictcomp>
  s#    ???:3eAh???rv   r   )r   r   r   s    @rt   r   z(Dataset.__getitems__.<locals>.<listcomp>
  s7    [[[A???????[[[rv   )r  r   nextrH  r   )rs   r   
n_examplesr   s      @rt   __getitems__zDataset.__getitems__
  sX      &&tDKK00122
[[[[zIZIZ[[[[rv   c                    d | j         D             }|sdS t          j                            |d                   }t                              d|            t          j        |          }g }|D ]}t          j                            t          j                            ||                    }|	                    d          rL|
                    d          r7||v rt                              d|            |                    |           |D ]3}t                              d|            t          j        |           4t          |          S )a  Clean up all cache files in the dataset cache directory, excepted the currently used cache file if there is
        one.

        Be careful when running this command that no other process is currently using other cache files.

        Returns:
            `int`: Number of removed files.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.cleanup_cache_files()
        10
        ```
        c                 X    g | ]'}t           j                            |d                    (S r  )r  r  abspath)r   
cache_files     rt   r   z/Dataset.cleanup_cache_files.<locals>.<listcomp>
  s+    jjj:rwz*/EFFjjjrv   r   zListing files in cache-r  z%Keeping currently used cache file at z	Removing )r$  r  r  dirnamer  rm   listdirr  r   
startswithendswithr   r  r   )rs   current_cache_filescache_directoryfilesfiles_to_removef_name	full_name	file_paths           rt   cleanup_cache_fileszDataset.cleanup_cache_files
  sW   $ kjY]Yijjj" 	1'//*=a*@AA999::::o66 	2 	2F_f(M(MNNI  ** 2vx/H/H 2 333KK S	 S STTT&&y111( 	! 	!IKK/I//000Ii    ?###rv   c                 &   t                      r@| j        r9d|z   dz   }t          j                            | j        d         d                   }n"dt                      z   dz   }t                      }t          j                            ||          }|S )Nr  r  r   r  )r9   r$  r  r  r  r7   r8   r   )rs   rC  r  r!  cache_file_paths        rt   _get_cache_file_pathzDataset._get_cache_file_path  s     	DD$4 	D&4x?O good.>q.A*.MNNOO&)D)F)FFQOACCO',,HHrv   _{rank:05d}_of_{num_proc:05d}functionwith_indices	with_rankinput_columnsr  r  disable_nullable	fn_kwargssuffix_templater  c                 d   	
&'() 	rt          d          dk    rt          d          t                     dk    rh j        Ht           j                            dd           j                                         j                   |r 	                    |          S  S |d }t          |t                    r|g}|2|D ]/}| j        j        vrt          d| d j        j                   0t          |t                    r|g}|Xt           fd	|D                       r=t          d
t          t!           fd|                     d j        j                   

nt#                      
|i }`t                     k    rMt                     t$                              dt                      d dt                      d            ||||||||	|||d&Pt)          t          j                  }t-          t          j        d&          }d|d<   t/           j        ||          nt3                     &d<    j        r                               &d<   
fd}nd}|r|rt                     |z  |z  |z  |z  }nt                     }d}dk    rd}	  |&          }t$                              d&d                     n# t8          $ r Y nw xY w|t;          j        t;          j                     d|d|pd          5 }t          j        di &D ]F\  }}}|r)|dz  }t$                               d| d| d           |}1|!                    |           G	 ddd           n# 1 swxY w Y   |
J d            |j         j        k    r|_        |S dtD          t                   dtF          tH          tJ          d          f         d!tD          t                   ffd"'dt          dtH          d!t          ffd#(tM          tN          j(                  } | )                    d$d%          *                                d&vrt$                              d'           d%tN          j(        d$<   	 fd(tW                    D             )&'()fd)tW          |          D             }!dg|z  }"tW          |          D ],}	  ||!|                   |"|<   d|!|<   # t8          $ r Y )w xY wd* |!D             }!|!r[t          |!          |k     r.t$                              d+t          |!           d,| d-           tY          t          |!                    5 }#| tN          _(        t$                              d. d/           t;          j        t;          j                     d|d|pdd0 d1z             5 }t[          |#t          j        |!2          D ]I\  }}}|r,|dz  }t$                               d| d| d           ||"|<   4|!                    |           J	 ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   |!D ]}$|$d3= n't$                              d 'd                       d|"vsJ d4|" d5            t$                              d6 d7           t]          |"          }%t          d8 t_          |")          D                       r|%_        n j        |%_        |%S )9a@  
        Apply a function to all the examples in the table (individually or in batches) and update the table.
        If your function returns a column that already exists, then it overwrites it.

        You can specify whether the function should be batched or not with the `batched` parameter:

        - If batched is `False`, then the function takes 1 example in and should return 1 example.
          An example is a dictionary, e.g. `{"text": "Hello there !"}`.
        - If batched is `True` and `batch_size` is 1, then the function takes a batch of 1 example as input and can return a batch with 1 or more examples.
          A batch is a dictionary, e.g. a batch of 1 example is `{"text": ["Hello there !"]}`.
        - If batched is `True` and `batch_size` is `n > 1`, then the function takes a batch of `n` examples as input and can return a batch with `n` examples, or with an arbitrary number of examples.
          Note that the last batch may have less than `n` examples.
          A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`.

        Args:
            function (`Callable`): Function with one of the following signatures:

                - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False` and `with_rank=False`
                - `function(example: Dict[str, Any], *extra_args) -> Dict[str, Any]` if `batched=False` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)
                - `function(batch: Dict[str, List]) -> Dict[str, List]` if `batched=True` and `with_indices=False` and `with_rank=False`
                - `function(batch: Dict[str, List], *extra_args) -> Dict[str, List]` if `batched=True` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)

                For advanced usage, the function can also return a `pyarrow.Table`.
                Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged.
                If no function is provided, default to identity function: `lambda x: x`.
            with_indices (`bool`, defaults to `False`):
                Provide example indices to `function`. Note that in this case the
                signature of `function` should be `def function(example, idx[, rank]): ...`.
            with_rank (`bool`, defaults to `False`):
                Provide process rank to `function`. Note that in this case the
                signature of `function` should be `def function(example[, idx], rank): ...`.
            input_columns (`Optional[Union[str, List[str]]]`, defaults to `None`):
                The columns to be passed into `function`
                as positional arguments. If `None`, a `dict` mapping to all formatted columns is passed as one argument.
            batched (`bool`, defaults to `False`):
                Provide batch of examples to `function`.
            batch_size (`int`, *optional*, defaults to `1000`):
                Number of examples per batch provided to `function` if `batched=True`.
                If `batch_size <= 0` or `batch_size == None`, provide the full dataset as a single batch to `function`.
            drop_last_batch (`bool`, defaults to `False`):
                Whether a last batch smaller than the batch_size should be
                dropped instead of being processed by the function.
            remove_columns (`Optional[Union[str, List[str]]]`, defaults to `None`):
                Remove a selection of columns while doing the mapping.
                Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding
                columns with names in `remove_columns`, these columns will be kept.
            keep_in_memory (`bool`, defaults to `False`):
                Keep the dataset in memory instead of writing it to a cache file.
            load_from_cache_file (`Optioanl[bool]`, defaults to `True` if caching is enabled):
                If a cache file storing the current computation from `function`
                can be identified, use it instead of recomputing.
            cache_file_name (`str`, *optional*, defaults to `None`):
                Provide the name of a path for the cache file. It is used to store the
                results of the computation instead of the automatically generated cache file name.
            writer_batch_size (`int`, defaults to `1000`):
                Number of rows per write operation for the cache file writer.
                This value is a good trade-off between memory usage during the processing, and processing speed.
                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.
            features (`Optional[datasets.Features]`, defaults to `None`):
                Use a specific Features to store the cache file
                instead of the automatically generated one.
            disable_nullable (`bool`, defaults to `False`):
                Disallow null values in the table.
            fn_kwargs (`Dict`, *optional*, defaults to `None`):
                Keyword arguments to be passed to `function`.
            num_proc (`int`, *optional*, defaults to `None`):
                Max number of processes when generating cache. Already cached shards are loaded sequentially.
            suffix_template (`str`):
                If `cache_file_name` is specified, then this suffix
                will be added at the end of the base name of each. Defaults to `"_{rank:05d}_of_{num_proc:05d}"`. For example, if `cache_file_name` is "processed.arrow", then for
                `rank=1` and `num_proc=4`, the resulting file would be `"processed_00001_of_00004.arrow"` for the default suffix.
            new_fingerprint (`str`, *optional*, defaults to `None`):
                The new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.
            desc (`str`, *optional*, defaults to `None`):
                Meaningful description to be displayed alongside with the progress bar while mapping examples.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> def add_prefix(example):
        ...     example["text"] = "Review: " + example["text"]
        ...     return example
        >>> ds = ds.map(add_prefix)
        >>> ds[0:3]["text"]
        ['Review: compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',
         'Review: the soundtrack alone is worth the price of admission .',
         'Review: rodriguez does a splendid job of racial profiling hollywood style--casting excellent latin actors of all ages--a trend long overdue .']

        # process a batch of examples
        >>> ds = ds.map(lambda example: tokenizer(example["text"]), batched=True)
        # set number of processors
        >>> ds = ds.map(add_prefix, num_proc=4)
        ```
        NzEPlease use either `keep_in_memory` or `cache_file_name` but not both.r   z num_proc must be an integer > 0.rm   rn   rC  c                     | S rp   r   xs    rt   r  zDataset.map.<locals>.<lambda>  s     rv   zInput column r  c              3   4   K   | ]}|j         j        vV  d S rp   ry  r  s     rt   rP  zDataset.map.<locals>.<genexpr>  s.      -k-kUXc9P.P-k-k-k-k-k-krv   zColumn to remove c                      | j         j        vS rp   ry  r  s    rt   r  zDataset.map.<locals>.<lambda>  s    CtzG^<^ rv   znum_proc must be <= z. Reducing num_proc to z for dataset of size rg  )r	  r,  r-  r.  r/  r  r   r  r  r  r  r   r0  r1  r   r  fingerprint_namer  c                    | d         }| d         vt           j                            | d                   rQrO|j                                        }|_        d|_        t                              | d         ||j	                  S t          )zILoad a processed shard from cache if it exists, otherwise throw an error.r	  r  Nr~  )r  r  existsrm   r   r   r   r   r  rn   ry  )shard_kwargsr	  rm   r   r  s      rt   load_processed_shard_from_cachez4Dataset.map.<locals>.load_processed_shard_from_cache  s     )E-.:7>>,/@"ABB lG[ l :??,,D$,DM*.D'",,\:K-LSW_d_j,kkk))rv   r   z$Loading cached processed dataset at r  FMapr  z!Finished processing shard number r  z&Failed to retrieve the result from maprank*rz   c                 j   | s| S |                      d          }| d |         | |d          }}t          |t                    r>|                    |          z   |z   } t                              d| d|             n1|                    dd                              |          z   |z   } | S )Nrg  r@  r  z	Process #z will write at z
{rank:05d}z{rank})rindexr   r   r   r  rm   replace)r  r@  sep	base_name	extensionr  r2  s        rt   format_cache_file_namez+Dataset.map.<locals>.format_cache_file_name  s     ' +**%,,S11'6tt'<ocdd>S9	dC(( &//2H2Hd]e2H2f2f&fir&rOKK RD R R R RSSSS ")11,IIPPVZemPnno#$ $
 '&rv   c                 Z    |                      |          z   } t          |            | S )NrC  )r   r<   )r  r@  r  r2  s     rt   format_new_fingerprintz+Dataset.map.<locals>.format_new_fingerprint!  s6    "1O4J4JPT_g4J4h4h"h$_555&&rv   TOKENIZERS_PARALLELISMfalse) offrM  fr  n0z:Setting TOKENIZERS_PARALLELISM=false for forked processes.c                 B    g | ]}                     |d           S )T)r  ru  r  r  r  )r   r@  r  r  rs   s     rt   r   zDataset.map.<locals>.<listcomp>4  s>        

hdt\j
kk  rv   c                     g | ]G}i |          |          |t          d  d|         D                        |          dHS )c              3   4   K   | ]}t          |          V  d S rp   r   )r   ss     rt   rP  z)Dataset.map.<locals>.<listcomp>.<genexpr>>  s(      !@!@Q#a&&!@!@!@!@!@!@rv   N)r	  r  r@  offsetr  )sum)r   r@  r  dataset_kwargsrI  rK  r  shardss     rt   r   zDataset.map.<locals>.<listcomp>8  s     
 
 
 $#D\'='=ot'T'T !!@!@&$-!@!@!@@@'='=ot'T'T  
 
 
rv   c                     g | ]}||S rp   r   )r   r<  s     rt   r   zDataset.map.<locals>.<listcomp>L  s    XXXVEWfEWEWEWrv   zReprocessing r  z9 shards because some of them were missing from the cache.z	Spawning z
 processesz (num_proc=)r  r	  z1Failed to retrieve results from map: result list zG still contains None - at least one worker failed to return its resultszConcatenating z shardsc              3   <   K   | ]\  }}|j         |j         k    V  d S rp   r9  )r   transformed_shardr	  s      rt   rP  zDataset.map.<locals>.<genexpr>r  sF        ,%u ".%2DD     rv   )0r   r   r  r   r#  r  rm   r   rn   r  r   r   r  r3  r  r   r  r9   r  r  r5   _map_singler4   r;   r9  r<   r$  r*  ry  rW   r'  r(  r+  r,  r   r   r   rg   r   r  environrN  lowerr   r   r`   _concatenate_map_style_datasetszip)*rs   r,  r-  r.  r/  r  r   r  r  r  r  r  r  r   r0  r1  r  r2  r  r  input_columnr  kwargs_for_fingerprintr>  r  
pbar_totalr7  transformed_datasetr8  r@  r=  r>  prev_envr9  transformed_shardsr<  r<  resultrZ  rI  rK  r[  s*   `        ``` `  ```                   @@@@rt   r"  zDataset.map  s
   r  	fo9deeeHMM?@@@ t99>>}(IOOAq))))* /	    **>:::"{HmS)) 	,*OM$ -  tz'>>>$ E  E  Ekoku  lC  E  E   ?
 nc** 	.,-N%#-k-k-k-k\j-k-k-k*k*k% AD0^0^0^0^`n)o)o$p$p  A  A  hl  hr  h  A  A   8L7W33]o]q]qIHs4yy$8$84yyHNNts4yytttthklphqhqttt  
  ("*$.,,!2  0"
 
" " 99LMMI%B7CVXZ\j%k%k"9J"#5601BIOeffOO 111,;() 	M&"&";";O"L"L,;()
	* 
	* 
	* 
	* 
	* 
	* "*!5XX1
 	# 	#Tj0J>KjXJJTJx1}}"&&E&En&U&U#inUfFgiijjjj*   "*\ ' ? A AA$$   1 /6/B/T/T^/T/T 1 1+dG 1'1,K"LL)dT)d)dWa)d)d)deee29// KK000011 1 1 1 1 1 1 1 1 1 1 1 1 1 1 '224\222"/43DDD3B#0&&'!)#'6;C<M6N'#' ' ' ' ' ' '$' '3 '3 ' ' ' ' ' ' '
  
++H ||4g>>DDFF O   [\\\3:BJ/0     !(OO  F
 
 
 
 
 
 
 
 
 "*--
 
 
N #'*!4j))  /N/N~^bOc/d/d&t,+/N4((.   D YX>XXXN  v~&&33KK DN(;(;  D  Dj  D  D  D   #n--.. 5$!)BJKK @H @ @ @AAA $+$C$E$E E((#"me/HX/H/H/HH   5 3E '"5~4 4 4 5 5/D$  $ 5 +q 0 &-hQU-h-h[e-h-h-h i i i;B 24 8 8 $G 4 4 4 455 5 5 5 5 5 5 5 5 5 5 5 5 5 55 5 5 5 5 5 5 5 5 5 5 5 5 5 5( - ( (Fw( tF\F\]lnqFrFrttuuu.... _CU  _  _  _ /..KK::::;;;45GHHF  034F0O0O     8 '6##&*&7#Mso   .K? ?
LL>AN$$N(+N(;T
T"!T"AY;0A'Y$Y;$Y(	(Y;+Y(	,Y;;Y?Y?r@  rX  c              #   
   	
%&'(K   i r||dk    r j         }d' j                                        }s j        d|d<   t	           j        fd j        i|& G d dt                    %d (d%& '(fd
	}
	 fd}d}d\  }}}t          j                    5 }	  	                    d          }st          |          }nZ|st                     nt                     |z  |z  }t          t          d||          |                    ||                    }st          j                    }|D ]\  }} ||||          }'rn|dk    r# |            \  }}}|                    |           t#          |t$          j                  r|                    |           n|                    |           |dz  }t          j                    |t,          j        z   k    rt          j                    }d	|fV  d}ɐnmt          j                    }|D ]V\  }} t          |           }!t1          t          t3          |||z                                  j                              }"	  || |"t                                                     dk    |          } n# %$ r t9          d          dw xY w'rn|dk    r# |            \  }}}|                    |           t#          | t$          j                  r|                    |            n|                    |            ||!z  }t          j                    |t,          j        z   k    rt          j                    }d	|fV  d}X'r||                                 n# t          t@          f$ rt d	|fV  'ri||                                 |Q|!                                 tD          j#        $                    |j%                  rtE          j&        |j%                    w xY wddd           n# 1 swxY w Y   d	|fV  'rq|o|!                                 tO          j(        |j%        
           tE          j)        d          }#tE          j)        |#           tE          j*        
d|# z             'r j+                                        }$|j,        |$_        d|$_-        |)dt\          /                    
|$ j0                  fV  dS dt\          1                    |2                                |$ j0                  fV  dS d fV  dS )a   Apply a function to all the elements in the table (individually or in batches)
        and update the table (if function does update examples).

        Args:
            shard (`datasets.Dataset`): Dataset to map the transform on.
            function (`Callable`): with one of the following signature:
                - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False` and `with_rank=False`
                - `function(example: Dict[str, Any], *extra_args) -> Dict[str, Any]` if `batched=False` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)
                - `function(batch: Dict[str, List]) -> Dict[str, List]` if `batched=True` and `with_indices=False` and `with_rank=False`
                - `function(batch: Dict[str, List], *extra_args) -> Dict[str, List]` if `batched=True` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)

                For advanced usage, the function can also return a `pyarrow.Table`.
                Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged.
                If no function is provided, default to identity function: lambda x: x
            with_indices (`bool`, defaults to `False`): Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx[, rank]): ...`.
            with_rank (`bool`, default `False`): Provide process rank to `function`. Note that in this case the signature of `function` should be `def function(example[, idx], rank): ...`.
            input_columns (`Optional[List[str]]`, defaults to `None`): The columns to be passed into `function` as
                positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.
            batched (`bool`, defaults to `False`): Provide batch of examples to `function`
            batch_size (`int`, optional, defaults to `1000`): Number of examples per batch provided to `function` if `batched=True`
                `batch_size <= 0` or `batch_size == None`: Provide the full dataset as a single batch to `function`
            drop_last_batch (`bool`, default: `False`): Whether a last batch smaller than the batch_size should be
                dropped instead of being processed by the function.
            remove_columns (`Optional[List[str]]`, defaults to `None`): Remove a selection of columns while doing the mapping.
                Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding
                columns with names in `remove_columns`, these columns will be kept.
            keep_in_memory (`bool`, defaults to `False`): Keep the dataset in memory instead of writing it to a cache file.
            cache_file_name (`str`, optional, defaults to `None`): Provide the name of a path for the cache file. It is used to store the
                results of the computation instead of the automatically generated cache file name.
            writer_batch_size (`int`, default `1000`): Number of rows per write operation for the cache file writer.
                This value is a good trade-off between memory usage during the processing, and processing speed.
                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `.map()`.
            features (`Optional[datasets.Features]`, defaults to `None`): Use a specific Features to store the cache file
                instead of the automatically generated one.
            disable_nullable (`bool`, defaults to `False`): Disallow null values in the table.
            fn_kwargs (`Dict`, optional, defaults to `None`): Keyword arguments to be passed to `function`
            new_fingerprint (`str`, optional, defaults to `None`): the new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments
            rank: (`int`, optional, defaults to `None`): If specified, this is the process rank when doing multiprocessing
            offset: (`int`, defaults to 0): If specified, this is an offset applied to the indices passed to `function` if `with_indices=True`.
        Nr   Tlazyr   c                       e Zd ZdS )5Dataset._map_single.<locals>.NumExamplesMismatchErrorNr-  r   rv   rt   NumExamplesMismatchErrorrp    s        Drv   rq  c                    | At          | t          t          j        f          s t	          dt          |            d          t          |t                    rt          | t                    r t          t          j        t          j
        ft          j        rdt          j        v rddl}|j        fz  t          j        rdt          j        v rddl}|j        fz  t          j        rdt          j        v rddlm} |j        fz  t-          fd|                                 D                       }|d	u r6t	          d
d |                                 D              d d          dS dS dS )z$Validate output of the map function.NzYProvided `function` which is applied to all elements of table returns a variable of type z. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects.r   r   torchjaxc              3   8   K   | ]}t          |          V  d S rp   )r   )r   r   allowed_batch_return_typess     rt   rP  zHDataset._map_single.<locals>.validate_function_output.<locals>.<genexpr>  s?       0 0FKJu&@AA0 0 0 0 0 0rv   FzXProvided `function` which is applied to all elements of table returns a `dict` of types c                 ,    g | ]}t          |          S r   r   )r   r7  s     rt   r   zIDataset._map_single.<locals>.validate_function_output.<locals>.<listcomp>  s?      t`  t`  t`  ABtxyzt{t{  t`  t`  t`rv   z[. When using `batched=True`, make sure provided `function` returns a `dict` of types like `z`.)r   r   rh  rN   ri  r   r   r   r   pdSeriesr    r   r  modulesr   r   TORCH_AVAILABLErs  JAX_AVAILABLE	jax.numpyr   rQ  r  )processed_inputsr   r   rs  jnpall_dict_values_are_listsrv  s         @rt   validate_function_outputz5Dataset._map_single.<locals>.validate_function_output  s%   +J?ORY[][cQd4e4e+ kpt  vF  qG  qG  k  k  k   GT** z:JG/T/T .2BJ	-J*& ?<3;+F+F++++.29,>.) Bg.D.D LLL.5</A.' AES[,@,@++++++.3;.@.,/ 0 0 0 0O_OfOfOhOh0 0 0 - -) -55# [  t`  t`  FV  F]  F]  F_  F_  t`  t`  t`  [  [  }W  [  [  [  %   " 65rv   Fc           	          t           sdnt           j                            gnfdD             }dk    r|}n(t          |t                    rfd|D             n|z   }d}r||fz  }r|fz  } g ||R i t          t
                    r(fdj                                        D             d}nd	}-t          t          t          j
        f           |           sdS j        sr5t          t           j                                                             }n>t          t
                    r' fd
j                                        D             }n}9D ]6}	|	|v r|                    |	           |r|	v r                    |	           7|r`t#                     }
t#          t%          t'                                                                                 }|
|k    r
             t          t                    rt          t                    ri |S S )z8Utility to apply the function on a selection of columns.r   )r  r  Nc                      g | ]
}|         S r   r   )r   r  inputss     rt   r   zRDataset._map_single.<locals>.apply_function_on_filtered_inputs.<locals>.<listcomp>  s    =c=c=ccfSk=c=c=crv   c                     g | ]}|z   S r   r   )r   r   rX  s     rt   r   zRDataset._map_single.<locals>.apply_function_on_filtered_inputs.<locals>.<listcomp>  s    $A$A$AAQZ$A$A$Arv   r   c                 .    i | ]\  }}|j         v||S r   keys_to_format)r   rO  r  r  s      rt   r   zRDataset._map_single.<locals>.apply_function_on_filtered_inputs.<locals>.<dictcomp>  s4     $ $ $!QaO_OnFnFnAqFnFnFnrv   TFc                 >    i | ]\  }}||j         vr|n|         S r   r  )r   rO  r  r  	pa_inputss      rt   r   zRDataset._map_single.<locals>.apply_function_on_filtered_inputs.<locals>.<dictcomp>  sC     # # #SWSTVWAQf&;;;1# # #rv   )r>   r   rw  r   r   rB   r#  r   r   rh  rN   r5  r   re  r3  itercolumnsr   r   r  rH  r   )r  r   check_same_num_examplesrX  fn_argseffective_indicesadditional_argsreturned_lazy_dictinputs_to_merger   input_num_examplesprocessed_inputs_num_examplesr  r  rq  r  r1  r,  r/  input_formatterr@  r  r	  update_datar  r-  r.  s   `  `        @@rt   !apply_function_on_filtered_inputsz>Dataset._map_single.<locals>.apply_function_on_filtered_inputs  s%    " ?eI,>&?&?,)	  F #0"7vhh=c=c=c=cUb=c=c=cG{{$+!!EOPWY]E^E^$t$A$A$A$A$A$A$A$Adkntdt! O 8$5#77 +D7*'xPP?PPPiPP*H55 +$ $ $ $%5%:%@%@%B%B$ $ $  &*""%*""()9GRX;NOO(()97CCC t! )] )"&s9+A9CXCXCZCZ'['["\"\FH-- )# # # # #[a[f[l[l[n[n# # # #)), 5 5F00'++F333) 5f8H.H.H(,,V444& 5%(^^"034DT$O_OdOdOfOfJgJgEhEh4i0j0j-%)FFF22444&'** (z:JG/T/T ( ?/>-=>>''rv   c                  f   } | 

j         } d}nd}s+t          j                    }d }t          | ||	          }nmd }t                              d            t          j        dt          j	        
                              d          }t          | |j        |	          }|||fS )NTF)r   streamr  update_featuresrC  r0  zCaching processed dataset at wbdirdelete)r   r  r  r  rC  r0  )r   rh  BufferOutputStreamr"   r  rm   tempfileNamedTemporaryFiler  r  r  name)writer_featuresr  
buf_writertmp_filerN  r  r0  r   r  r  r	  r  s        rt   init_buffer_and_writerz3Dataset._map_single.<locals>.init_buffer_and_writer-  s    &O&"'."&"' !8244
$,%&7$3 /%5   "
MOMMNNN#6tQ`AaAajoppp$,!&7$3 /%5   vx//rv   NNNrE  )r  )rX  r   )r  rX  zUsing `.map` in batched mode on a dataset with attached indexes is allowed only if it doesn't create or remove existing examples. You can first run `.drop_index() to remove your index and then re-add it.  r~  )Fr   )3rw  r6  r   r5  r@   r   	Exception
contextlib	ExitStackr!  r  r   re  r   rH  rG  enter_contextr   rh  rN   	write_rowwriter    rJ  r   r  r   r"  r,  rI  write_batchrK  KeyboardInterruptrL  r  r  r<  r  r  shutilmoveumaskchmodrm   	_featuresr   r   r  rn   r  getvalue))r	  r,  r-  r.  r/  r  r   r  r  r  r  r  r   r0  r1  r  r@  rX  r1  r  r  rM  r  rN  r  stackarrow_formatted_shardshard_iterablerw  rO  r   exampler   num_examples_in_batchr   r  rm   rq  r  r  r  s)   ``````  `````````                    @@@@rt   ra  zDataset._map_single{  s     | I  	(
*jAooJ ,1133 	)!3!;$(M&!'
 
^
 
 
	 	 	 	 	y 	 	 		 	 	8<	( <	( <	( <	( <	( <	( <	( <	( <	( <	( <	( <	( <	( <	( <	( <	( <	( <	(|	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0B ()$'7$
FH !## H	uG(-(9(9'(B(B%  %./D%E%ENN1@ks5zzzc%jjT^F^akFkH%(a:66-22:2__& &N  /= IKKE&4 = =
7"C"CGQW]"^"^"^& 6 Avv?U?U?W?W <
FH % 3 3F ; ; ;)'28<< 6 & 0 0 9 9 9 9 &W 5 5 54949;;1R)RRR$(IKKE"&/K"KKKK;<8=  !IKKE$2 = =503E

-"&!E!Q^$<$<$D$DU^$T$TV# #
($E$E % '8;E<N<N<P<P8Q8QTU8U'-	% % %EE  8 ( ( ("F !n# ##'(( ' : Avv?U?U?W?W <
FH % 3 3F ; ; ;)%:: : & 2 25 9 9 9 9 & 2 25 9 9 948MM49;;1R)RRR$(IKKE"&/K"KKKK;<8 &6#5OO%%%01 	 	 	E#????? 5))))+ (((7>>(-88 5Ihm444	H	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	T E77777 	68/NNK777HUOOEHUOOOH_eufn555 
	$:??$$D",DM"&D!D'"3"3O$V[Va"3"b"bbbbbbbD'"5"5j6I6I6K6KRV^c^i"5"j"jjjjjjje######sE   ;P:=GN&
2J=<N&=KCN&%P:&BP++P::P>P>)r  r  r  z2.0.1)r  ignore_kwargsr   c                    t          |                                           dk    rt          d          |d }t          |           dk    r| S |                     t	          t
          ||||| j                  dt          dt          d          i          d|| j	        ||||	|
|||||pd	          }t          j        |           }|j        |_        ||_        |S )
a>  Apply a filter function to all the elements in the table in batches
        and update the table so that the dataset only includes examples according to the filter function.

        Args:
            function (`Callable`): Callable with one of the following signatures:

                - `function(example: Dict[str, Any]) -> bool` if `with_indices=False, batched=False`
                - `function(example: Dict[str, Any], indices: int) -> bool` if `with_indices=True, batched=False`
                - `function(example: Dict[str, List]) -> List[bool]` if `with_indices=False, batched=True`
                - `function(example: Dict[str, List], indices: List[int]) -> List[bool]` if `with_indices=True, batched=True`

                If no function is provided, defaults to an always `True` function: `lambda x: True`.
            with_indices (`bool`, defaults to `False`):
                Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx): ...`.
            input_columns (`str` or `List[str]`, *optional*):
                The columns to be passed into `function` as
                positional arguments. If `None`, a `dict` mapping to all formatted columns is passed as one argument.
            batched (`bool`, defaults to `False`):
                Provide batch of examples to `function`.
            batch_size (`int`, *optional*, defaults to `1000`):
                Number of examples per batch provided to `function` if
                `batched = True`. If `batched = False`, one example per batch is passed to `function`.
                If `batch_size <= 0` or `batch_size == None`, provide the full dataset as a single batch to `function`.
            keep_in_memory (`bool`, defaults to `False`):
                Keep the dataset in memory instead of writing it to a cache file.
            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):
                If a cache file storing the current computation from `function`
                can be identified, use it instead of recomputing.
            cache_file_name (`str`, *optional*):
                Provide the name of a path for the cache file. It is used to store the
                results of the computation instead of the automatically generated cache file name.
            writer_batch_size (`int`, defaults to `1000`):
                Number of rows per write operation for the cache file writer.
                This value is a good trade-off between memory usage during the processing, and processing speed.
                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.
            fn_kwargs (`dict`, *optional*):
                Keyword arguments to be passed to `function`.
            num_proc (`int`, *optional*):
                Number of processes for multiprocessing. By default it doesn't
                use multiprocessing.
            suffix_template (`str`):
                If `cache_file_name` is specified, then this suffix will be added at the end of the base name of each.
                For example, if `cache_file_name` is `"processed.arrow"`, then for `rank = 1` and `num_proc = 4`,
                the resulting file would be `"processed_00001_of_00004.arrow"` for the default suffix (default
                `_{rank:05d}_of_{num_proc:05d}`).
            new_fingerprint (`str`, *optional*):
                The new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.
            desc (`str`, *optional*, defaults to `None`):
                Meaningful description to be displayed alongside with the progress bar while filtering examples.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.filter(lambda x: x["label"] == 1)
        Dataset({
            features: ['text', 'label'],
            num_rows: 533
        })
        ```
        r   zUsing `.filter` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it.`Nc                     dS )NTr   r6  s    rt   r  z Dataset.filter.<locals>.<lambda>  s     rv   Tr   uint64Filter)r,  r-  r   r  r   r  r  r  r  r  r1  r  r2  r  r/  r  )r   r"  r,  r"  r   get_indices_from_mask_functionr  r(   r*   r3  r   r   r#  r9  )rs   r,  r-  r/  r  r   r  r  r  r  r1  r  r2  r  r  r   new_datasets                    rt   r  zDataset.filter  s	   h t  ""##a''6 `   %~Ht99>>K((.'<Q^`d`m  y%//:;;!,)!5+/++'!%  
 
( mD))&|#2 rv   )r  r  c                 >    |                      d||||||d|	  	        S )aB  Create and cache a new Dataset by flattening the indices mapping.

        Args:
            keep_in_memory (`bool`, defaults to `False`):
                Keep the dataset in memory instead of writing it to a cache file.
            cache_file_name (`str`, *optional*, default `None`):
                Provide the name of a path for the cache file. It is used to store the
                results of the computation instead of the automatically generated cache file name.
            writer_batch_size (`int`, defaults to `1000`):
                Number of rows per write operation for the cache file writer.
                This value is a good trade-off between memory usage during the processing, and processing speed.
                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.
            features (`Optional[datasets.Features]`, defaults to `None`):
                Use a specific [`Features`] to store the cache file
                instead of the automatically generated one.
            disable_nullable (`bool`, defaults to `False`):
                Allow null values in the table.
            num_proc (`int`, optional, default `None`):
                Max number of processes when generating cache. Already cached shards are loaded sequentially
            new_fingerprint (`str`, *optional*, defaults to `None`):
                The new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments
        TzFlattening the indices)	r  r  r  r  r   r0  r  r  r  )r"  )rs   r  r  r  r   r0  r  r  s           rt   r  zDataset.flatten_indices)  s;    H xx)+/-+)  

 

 
	
rv   indices_cache_file_namec                 
   ||t          d          |t          d          |t          j        |          }nt          j        |          }t          | j        | j                                        | j	        ||          S )zReturn a new Dataset obtained by adding indices (provided in indices_cache_file_name or in a buffer) to the
        current Dataset.
        NzKAt least one of indices_cache_file_name or indices_buffer must be provided.z9please specify a fingerprint for the dataset with indicesrm   rn   r|  rC  )
r   rM   r  rL   r  r   r  rm   r   rn   )rs   r  r  rC  r|  s        rt   _new_dataset_with_indicesz!Dataset._new_dataset_with_indicesY  s     #*~/EjkkkXYYY".-78OPPMM)5nEEM J!!*'#
 
 
 	
rv   r   c                    |r|t          d          t          |                                           dk    rt          d          t          |           dk    r| S t	          |t
          j        t
          j        f          r1|                                	                    t          j                  }t	          |t                    rt          |          }t	          |t                    rIt          |          r9|j        dk    r.|j        |j        |j        z
  }}|                     |||          S n	 t'          t)          |                    }n(# t*          $ r |                     dd|          cY S w xY w|dk    rft-          j        |          }t1          d t3          ||          D                       r*t'          |          |z
  }|                     |||          S |                     |||||          S )	am  Create a new dataset with rows selected following the list/array of indices.

        Args:
            indices (`range`, `list`, `iterable`, `ndarray` or `Series`):
                Range, list or 1D-array of integer indices for indexing.
                If the indices correspond to a contiguous range, the Arrow table is simply sliced.
                However passing a list of indices that are not contiguous creates indices mapping, which is much less efficient,
                but still faster than recreating an Arrow table made of the requested rows.
            keep_in_memory (`bool`, defaults to `False`):
                Keep the indices mapping in memory instead of writing it to a cache file.
            indices_cache_file_name (`str`, *optional*, defaults to `None`):
                Provide the name of a path for the cache file. It is used to store the
                indices mapping instead of the automatically generated cache file name.
            writer_batch_size (`int`, defaults to `1000`):
                Number of rows per write operation for the cache file writer.
                This value is a good trade-off between memory usage during the processing, and processing speed.
                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.
            new_fingerprint (`str`, *optional*, defaults to `None`):
                The new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.select(range(4))
        Dataset({
            features: ['text', 'label'],
            num_rows: 4
        })
        ```
        NMPlease use either `keep_in_memory` or `indices_cache_file_name` but not both.r   Using `.select` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it.r  )startc              3   (   K   | ]\  }}||k    V  d S rp   r   )r   r   js      rt   rP  z!Dataset.select.<locals>.<genexpr>  s*      KK$!QqAvKKKKKKrv   )r  r  r  r  )r   r   r"  r,  r   rh  r  r  to_numpyastyper   r   r   r   r   rC   r  stop_select_contiguousr  rH  StopIteration	itertoolsrm  rQ  re  _select_with_indices_mapping)	rs   r   r  r  r  r  r  lengthcounter_from_starts	            rt   r  zDataset.selectx  s0   V  	n5Almmmt  ""##a''6 _  
 t99>>K g"/:;; 	:&&((//99G gx(( 	$7mmG gu%% 	c#G,, _!1C1C 'w|gm/Kv..ufo.^^^VT']]++  V V V..q!_.UUUUUV zz%._5%A%A%A"KK#g7I*J*JKKKKK c!"455=F225&Ra2bbb 00)$;/+ 1 
 
 	
s   7E "E98E9r  r  c           	      Z   t          |                                           dk    rt          d          t          |           dk    r| S t          |t          |                      t          ||z   dz
  t          |                      | j        |dk    rHt          | j                            ||          | j        	                                | j
        |          S t          | j        | j        	                                | j
        | j                            ||          |          S )a  Create a new dataset with rows from a contiguous slice of data.
        The slice is defined by that start index and its length.

        Args:
            start (`int`): start index.
            length (`int`): length of the slice to select.
            new_fingerprint (`str`, optional, default `None`): the new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds._select_contiguous(0, 4)
        Dataset({
            features: ['text', 'label'],
            num_rows: 4
        })
        ```
        r   r  r   Nr4  r  )r   r"  r,  rw  r  r   r#  r  rm   r   rn   )rs   r  r  r  s       rt   r  zDataset._select_contiguous  s!   : t  ""##a''6 _  
 t99>>K"5#d))444"56>A#5s4yyAAA= FaKK	v..Y^^%%j+	    	Y^^%%j"m11%@@+   rv   c                    |r|t          d          t          |                                           dk    rt          d          t          |           dk    r| S |s|)t	          j                    }d}t          |||d          }nkd}t                              d|            t          j
        dt          j                            |          d	
          }t          |j        ||d          }t          |t                     r|nt!          |          }t          |           }	|rWt#          t%          t'          |                    |	           t#          t%          t)          |                    |	           n|                     dd|          S t	          j        |t	          j                              }
| j        -| j                            d                              |
          }
t          j                            |
gdg          }|5  	 |                    |           |                                 ni# t>          t@          f$ rU |Q|!                                 t          j        "                    |j                  rt          j#        |j                    w xY w	 ddd           n# 1 swxY w Y   |o|!                                 tI          j%        |j        |           t          j&        d          }t          j&        |           t          j'        |d| z             || (                    ||          S | (                    |)                                |          S )a6  Create a new dataset with rows selected following the list/array of indices.
        The new dataset is made by creating a new indices mapping on top of the main arrow table.

        Args:
            indices (sequence, iterable, range, ndarray or Series): List or 1D-array of integer indices for indexing.
            keep_in_memory (`bool`, default `False`): Keep the indices mapping in memory instead of writing it to a cache file.
            indices_cache_file_name (`str`, optional, default `None`): Provide the name of a path for the cache file. It is used to store the
                indices mapping instead of the automatically generated cache file name.
            writer_batch_size (`int`, default `1000`): Number of rows per write operation for the cache file writer.
                This value is a good trade-off between memory usage during the processing, and processing speed.
                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `.map()`.
            new_fingerprint (`str`, optional, default `None`): the new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds._select_with_indices_mapping(range(4))
        Dataset({
            features: ['text', 'label'],
            num_rows: 4
        })
        ```
        Nr  r   r  r   )r  r  rC  r  zCaching indices mapping at r  Fr  )r  r  rC  r  )rv  r  rx  r  r  )r  rC  )r  rC  )*r   r   r"  r,  rh  r  r"   r  rm   r  r  r  r  r  r  r   r   rw  r   r  r   r  r   r  r  r   takerN   from_arraysrI  rK  r  r  rL  r<  r  r  r  r  r  r  r  )rs   r   r  r  r  r  r  r  rN  rv  indices_arrayr|  r  s                rt   r  z$Dataset._select_with_indices_mapping	  s   H  	n5Almmmt  ""##a''6 _  
 t99>>K  	4<.00JH !5FTcjs  FF JKKO6MOOPPP24RW__Md=e=enstttH ]6GUdkt  F (66I''DMM4yy 	R&s3w<<'8'8tDDDD&s3w<<'8'8tDDDDD**1a*QQQry{{;;;=$ M003388GGM,,m_YK,PP 		 		""=111!!!!01   'NN$$$w~~hm44 1	(-000 "		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 NNK'>???HUOOEHUOOOH,eufn=== 11(?_ 2    11ATATAVAVds1ttts+   J=)IJ=A&J--J==KKat_endreversenull_placementc
           	         t          |                                           dk    rt          d          t          |           dk    r| S |dk    rt          j        dt
                     t          |t                    s|g}t          |t                    s0t          |          t          |          k    rt          d          n|gt          |          z  }|D ]D}
t          |
t                    r|
| j        j        vrt          d|
 d| j        j                   E|d	vr%|d
k    rd}n|dk    rd}nt          d| d          ||nt                      }| j        rl||                     |	          }t           j                            |          r6|r4t&                              d|            |                     |	|          S t-          | j        t/          dt          |                     | j        | j        nd          }d t3          ||          D             }t5          j        |||          }|                     |||||	          S )a  Create a new dataset sorted according to a single or multiple columns.

        Args:
            column_names (`Union[str, Sequence[str]]`):
                Column name(s) to sort by.
            reverse (`Union[bool, Sequence[bool]]`, defaults to `False`):
                If `True`, sort by descending order rather than ascending. If a single bool is provided,
                the value is applied to the sorting of all column names. Otherwise a list of bools with the
                same length and order as column_names must be provided.
            kind (`str`, *optional*):
                Pandas algorithm for sorting selected in `{quicksort, mergesort, heapsort, stable}`,
                The default is `quicksort`. Note that both `stable` and `mergesort` use `timsort` under the covers and, in general,
                the actual implementation will vary with data type. The `mergesort` option is retained for backwards compatibility.
                <Deprecated version="2.8.0">

                `kind` was deprecated in version 2.10.0 and will be removed in 3.0.0.

                </Deprecated>
            null_placement (`str`, defaults to `at_end`):
                Put `None` values at the beginning if `at_start` or `first` or at the end if `at_end` or `last`

                <Added version="1.14.2"/>
            keep_in_memory (`bool`, defaults to `False`):
                Keep the sorted indices in memory instead of writing it to a cache file.
            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):
                If a cache file storing the sorted indices
                can be identified, use it instead of recomputing.
            indices_cache_file_name (`str`, *optional*, defaults to `None`):
                Provide the name of a path for the cache file. It is used to store the
                sorted indices instead of the automatically generated cache file name.
            writer_batch_size (`int`, defaults to `1000`):
                Number of rows per write operation for the cache file writer.
                Higher value gives smaller cache files, lower value consume less temporary memory.
            new_fingerprint (`str`, *optional*, defaults to `None`):
                The new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset('rotten_tomatoes', split='validation')
        >>> ds['label'][:10]
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> sorted_ds = ds.sort('label')
        >>> sorted_ds['label'][:10]
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        >>> another_sorted_ds = ds.sort(['label', 'text'], reverse=[True, False])
        >>> another_sorted_ds['label'][:10]
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        ```
        r   zUsing `.sort` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it.r  zE'kind' was deprecated in version 2.10.0 and will be removed in 3.0.0.)categoryzlParameter 'reverse' should be either a boolean or a list of booleans with the same length as 'column_names'.zColumn 'zA' not found in the dataset. Please provide a column selected in: )at_startr  firstr  lastr  znull_placement 'zX' is an invalid parameter value. Must be either 'last', 'at_end', 'first' or 'at_start'.Nz-Loading cached sorted indices for dataset at rC  r  rU  r   r   c                 $    g | ]\  }}||sd ndfS )	ascending
descendingr   )r   r  col_reverses      rt   r   z Dataset.sort.<locals>.<listcomp>  s7     
 
 
HX[S[B++lC
 
 
rv   )r  r  r   r  r  r  r  )r   r"  r,  r  r  r  r   r   r   r   r   r  r3  r9   r$  r*  r  r  r<  r  r  r  rA   r  r  re  pcsort_indicesr  )rs   r3  r  r   r  r  r  r  r  r  r   
sort_tabler  r   s                 rt   sortzDataset.sortr  s   D t  ""##a''6 ]   t99>>K <MW&    ,-- 	*(>L '4(( 	47||s<0000  C   1
 i#l"3"33G # 	 	Ffc** fDJ<S.S.S  Bv  B  Bhlhrh  B  B   /T !777((!+6))!)  P~  P  P  P   8L7W33]o]q]q  	&.*.*C*CO*T*T'w~~566 ;O hOfhhiii55 /I` 6    !*aT##%)]%>DMMD
 
 


 
\_`lnu\v\v
 
 
	 /*	R`aaa{{)$;/+  
 
 	
rv   )r  randomized_functionr  seedc                    t          |                                           dk    rt          d          t          |           dk    r| S |r|t          d          ||t          d          |.t	          |t
          j        j                  st          d          ||nt                      }|w|Vt
          j        	                                ^}}}	}|	dk     r||	         n|d         }t
          j                                        }t
          j        
                    |          }| j        rl||                     |          }t          j                            |          r6|r4t                               d|            |                     ||	          S |                    t          |                     }
|                     |
||s|nd||
          S )a  Create a new Dataset where the rows are shuffled.

        Currently shuffling uses numpy random generators.
        You can either supply a NumPy BitGenerator to use, or a seed to initiate NumPy's default random generator (PCG64).

        Shuffling takes the list of indices `[0:len(my_dataset)]` and shuffles it to create an indices mapping.
        However as soon as your [`Dataset`] has an indices mapping, the speed can become 10x slower.
        This is because there is an extra step to get the row index to read using the indices mapping, and most importantly, you aren't reading contiguous chunks of data anymore.
        To restore the speed, you'd need to rewrite the entire dataset on your disk again using [`Dataset.flatten_indices`], which removes the indices mapping.
        This may take a lot of time depending of the size of your dataset though:

        ```python
        my_dataset[0]  # fast
        my_dataset = my_dataset.shuffle(seed=42)
        my_dataset[0]  # up to 10x slower
        my_dataset = my_dataset.flatten_indices()  # rewrite the shuffled dataset on disk as contiguous chunks of data
        my_dataset[0]  # fast again
        ```

        In this case, we recommend switching to an [`IterableDataset`] and leveraging its fast approximate shuffling method [`IterableDataset.shuffle`].
        It only shuffles the shards order and adds a shuffle buffer to your dataset, which keeps the speed of your dataset optimal:

        ```python
        my_iterable_dataset = my_dataset.to_iterable_dataset(num_shards=128)
        for example in enumerate(my_iterable_dataset):  # fast
            pass

        shuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=42, buffer_size=100)

        for example in enumerate(shuffled_iterable_dataset):  # as fast as before
            pass
        ```

        Args:
            seed (`int`, *optional*):
                A seed to initialize the default BitGenerator if `generator=None`.
                If `None`, then fresh, unpredictable entropy will be pulled from the OS.
                If an `int` or `array_like[ints]` is passed, then it will be passed to SeedSequence to derive the initial BitGenerator state.
            generator (`numpy.random.Generator`, *optional*):
                Numpy random Generator to use to compute the permutation of the dataset rows.
                If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy).
            keep_in_memory (`bool`, default `False`):
                Keep the shuffled indices in memory instead of writing it to a cache file.
            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):
                If a cache file storing the shuffled indices
                can be identified, use it instead of recomputing.
            indices_cache_file_name (`str`, *optional*):
                Provide the name of a path for the cache file. It is used to store the
                shuffled indices instead of the automatically generated cache file name.
            writer_batch_size (`int`, defaults to `1000`):
                Number of rows per write operation for the cache file writer.
                This value is a good trade-off between memory usage during the processing, and processing speed.
                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.
            new_fingerprint (`str`, *optional*, defaults to `None`):
                The new fingerprint of the dataset after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds['label'][:10]
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

        # set a seed
        >>> shuffled_ds = ds.shuffle(seed=42)
        >>> shuffled_ds['label'][:10]
        [1, 0, 1, 1, 0, 0, 0, 0, 0, 0]
        ```
        r   zUsing `.shuffle` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it.Nr  zKBoth `seed` and `generator` were provided. Please specify just one of them.zDThe provided generator must be an instance of numpy.random.Generatorp  z/Loading cached shuffled indices for dataset at r  r  )r   r"  r,  r   r   r   random	Generatorr9   	get_statedefault_rngr$  r*  r  r  r<  r  r  r  permutationr  )rs   r  r  r  r  r  r  r  r   posr  s              rt   r   zDataset.shuffle  s	   j t  ""##a''6 `   t99>>K 	n5Almmm	 5jkkk Iry?R)S)S cddd7K7W33]o]q]q|#%9#6#6#8#8 4q$'#IItCyy47I$$&&	--d33I  	&.*.*C*CO*T*T'w~~566 ;O jQhjjkkk55 /I` 6     ++CII66{{)CQ$[$;$;W[/+  
 
 	
rv   train_new_fingerprinttest_new_fingerprint)r  train_indices_cache_file_nametest_indices_cache_file_name)r  r  fingerprint_namesr  	test_size
train_sizer   stratify_by_columnr  r  ri   c                    ddl m} t          |                                           dk    rt	          d          t          |           dk    r || | d          S ||d}t          |           }t          |t                    r||k    s'|dk    s!t          |t                    r"|dk    s|dk    rt          d| d	| d
          t          |t                    r||k    s'|dk    s!t          |t                    r"|dk    s|dk    rt          d| d	| d
          |>t          |t          t          f          s"t          d| dt          |                     |>t          |t          t          f          s"t          d| dt          |                     t          |t                    r4t          |t                    r||z   dk    rt          d||z    d          t          |t                    rt          ||z            }n$t          |t                    rt          |          }t          |t                    rt          ||z            }n$t          |t                    rt          |          }|||z
  }n|||z
  }||z   |k    rt          d||z    d| d          t          |          t          |          }}|dk    rt          d| d| d| d          ||nt                      }|{|du rw|Vt          j                                        ^}}}}|dk     r||         n|d         }t          j                                        }t          j                            |          }| j        r|	|
.|	|                     |          }	|
|                     |          }
t&          j                            |	          ryt&          j                            |
          rZ|rXt,                              d|	 d|
             ||                     ||	          |                     ||
          d          S |s?|t          d          t          j        |          }t          j        |||z             }no|3|| j        j                                        vr1t          d| d| j        j                                                   t          | j        j        |         t:                    sEt          d t:          j         d!| d"t          | j        j        |                   j         d#          	 t?          tA          | !                    d$          |         |||%                    \  }}ns# tD          $ r-}tG          |          d&k    rt          d'| d(          |d}~ww xY w|$                    t          |                     }|d|         }||||z            }| %                    |||	||)          }| %                    |||
||)          } |||d          S )*au  Return a dictionary ([`datasets.DatasetDict`]) with two random train and test subsets (`train` and `test` `Dataset` splits).
        Splits are created from the dataset according to `test_size`, `train_size` and `shuffle`.

        This method is similar to scikit-learn `train_test_split`.

        Args:
            test_size (`numpy.random.Generator`, *optional*):
                Size of the test split
                If `float`, should be between `0.0` and `1.0` and represent the proportion of the dataset to include in the test split.
                If `int`, represents the absolute number of test samples.
                If `None`, the value is set to the complement of the train size.
                If `train_size` is also `None`, it will be set to `0.25`.
            train_size (`numpy.random.Generator`, *optional*):
                Size of the train split
                If `float`, should be between `0.0` and `1.0` and represent the proportion of the dataset to include in the train split.
                If `int`, represents the absolute number of train samples.
                If `None`, the value is automatically set to the complement of the test size.
            shuffle (`bool`, *optional*, defaults to `True`):
                Whether or not to shuffle the data before splitting.
            stratify_by_column (`str`, *optional*, defaults to `None`):
                The column name of labels to be used to perform stratified split of data.
            seed (`int`, *optional*):
                A seed to initialize the default BitGenerator if `generator=None`.
                If `None`, then fresh, unpredictable entropy will be pulled from the OS.
                If an `int` or `array_like[ints]` is passed, then it will be passed to SeedSequence to derive the initial BitGenerator state.
            generator (`numpy.random.Generator`, *optional*):
                Numpy random Generator to use to compute the permutation of the dataset rows.
                If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy).
            keep_in_memory (`bool`, defaults to `False`):
                Keep the splits indices in memory instead of writing it to a cache file.
            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):
                If a cache file storing the splits indices
                can be identified, use it instead of recomputing.
            train_cache_file_name (`str`, *optional*):
                Provide the name of a path for the cache file. It is used to store the
                train split indices instead of the automatically generated cache file name.
            test_cache_file_name (`str`, *optional*):
                Provide the name of a path for the cache file. It is used to store the
                test split indices instead of the automatically generated cache file name.
            writer_batch_size (`int`, defaults to `1000`):
                Number of rows per write operation for the cache file writer.
                This value is a good trade-off between memory usage during the processing, and processing speed.
                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.
            train_new_fingerprint (`str`, *optional*, defaults to `None`):
                The new fingerprint of the train set after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments
            test_new_fingerprint (`str`, *optional*, defaults to `None`):
                The new fingerprint of the test set after transform.
                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds = ds.train_test_split(test_size=0.2, shuffle=True)
        DatasetDict({
            train: Dataset({
                features: ['text', 'label'],
                num_rows: 852
            })
            test: Dataset({
                features: ['text', 'label'],
                num_rows: 214
            })
        })

        # set a seed
        >>> ds = ds.train_test_split(test_size=0.2, seed=42)

        # stratified split
        >>> ds = load_dataset("imdb",split="train")
        Dataset({
            features: ['text', 'label'],
            num_rows: 25000
        })
        >>> ds = ds.train_test_split(test_size=0.2, stratify_by_column="label")
        DatasetDict({
            train: Dataset({
                features: ['text', 'label'],
                num_rows: 20000
            })
            test: Dataset({
                features: ['text', 'label'],
                num_rows: 5000
            })
        })
        ```
        r   rh   r   zUsing `.train_test_split` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it.)traintestNg      ?z
test_size=zB should be either positive and smaller than the number of samples z or a float in the (0, 1) rangeztrain_size=zInvalid value for train_size: z	 of type zInvalid value for test_size: z&The sum of test_size and train_size = zD, should be in the (0, 1) range. Reduce test_size and/or train_size.z&The sum of train_size and test_size = z/, should be smaller than the number of samples z%. Reduce test_size and/or train_size.zWith n_samples=z, test_size=z and train_size=zU, the resulting train set will be empty. Adjust any of the aforementioned parameters.Tr  z,Loading cached split indices for dataset at z and r  zBStratified train/test split is not implemented for `shuffle=False`zKey z not found in z,Stratifying by column is only supported for r  r  rg  r   )rngzMinimum class count errorzThe least populated class in zn column has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.r  )&dataset_dictri   r   r"  r,  r   r   floatr   r   r	   r
   r9   r   r  r  r  r$  r*  r  r  r<  r  r  r  arangerq   r   r   r'   r   r  rb   r!  r  r   r  r  )rs   r  r   r   r  r  r  r  r  r  r  r  r  r  ri   	n_samplesn_testn_trainr   r  train_indicestest_indiceserrorr  train_split
test_splits                             rt   train_test_splitzDataset.train_test_split  s+   ` 	.-----t  ""##a''6 i   t99>>;t<<===!3I II	y#&&		i''9>>)U++ ,:a9>>YY Y Y.7Y Y Y   z3''		y((J!OO*e,, -<qJ!OOYj Y Y.7Y Y Y  
 !*Z#u*N*N!ejeeSWXbScSceefff IU|)L)L bYbbQUV_Q`Q`bbcccj%(( 	Z	5-I-I 	j[dNdghNhNh>i9O > > >  
 i'' 	&)i/00FF	3'' 	&9%%Fj%(( 	(J233GG
C(( 	(J''G&(GG(FVi''69I  $     g,,Fa<<-) - - - -T^ - - -   8L7W33]o]q]qD|#%9#6#6#8#8 4q$'#IItCyy47I$$&&	--d33I  	,48T8\ 18484M4MNc4d4d1/7373L3LMa3b3b0<==GNN#?@@ )
  FC`  F  F  hD  F  F   #{!%!?!?(=Wt "@ " " !% > >(<Vr !? ! !	 	 	 	  #	I!- !efffIg..M9Wg.>??LL "-%TZ-@-E-E-G-GGG$%j,>%j%jdjNaNfNfNhNh%j%jkkk!$*"56H"I:VV $ BzGZ  B  B  qC  B  B  IM  NR  NX  Na  bt  Nu  Iv  Iv  I  B  B  B  $26A ,,W556HI7TZ`i  3 3/M<<
 ! 	$ 	$ 	$5zz%@@@(/<N / / /   $	$ (33CII>>*7F73 +Ffw6F,G Hkk!)$A/1 " 
 
 [[ )$@/0 ! 
 

 {[*EEFFFs   <V 
W(V??Wru  r  c                 p   d|cxk    r|k     sn t          d          |r[t          |           |z  }t          |           |z  }||z  t          ||          z   }	|	|z   ||k     rdndz   }
t          |	|
          }n#t	          j        |t          |           |          }|                     ||||          S )a	  Return the `index`-nth shard from dataset split into `num_shards` pieces.

        This shards deterministically. `dset.shard(n, i)` will contain all elements of dset whose
        index mod `n = i`.

        `dset.shard(n, i, contiguous=True)` will instead split dset into contiguous chunks,
        so it can be easily concatenated back together after processing. If `n % i == l`, then the
        first `l` shards will have length `(n // i) + 1`, and the remaining shards will have length `(n // i)`.
        `datasets.concatenate([dset.shard(n, i, contiguous=True) for i in range(n)])` will return
        a dataset with the same order as the original.

        Be sure to shard before using any randomizing operator (such as `shuffle`).
        It is best if the shard operator is used early in the dataset pipeline.


        Args:
            num_shards (`int`):
                How many shards to split the dataset into.
            index (`int`):
                Which shard to select and return.
            contiguous: (`bool`, defaults to `False`):
                Whether to select contiguous blocks of indices for shards.
            keep_in_memory (`bool`, defaults to `False`):
                Keep the dataset in memory instead of writing it to a cache file.
            indices_cache_file_name (`str`, *optional*):
                Provide the name of a path for the cache file. It is used to store the
                indices of each shard instead of the automatically generated cache file name.
            writer_batch_size (`int`, defaults to `1000`):
                Number of rows per write operation for the cache file writer.
                This value is a good trade-off between memory usage during the processing, and processing speed.
                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds
        Dataset({
            features: ['text', 'label'],
            num_rows: 1066
        })
        >>> ds.shard(num_shards=2, index=0)
        Dataset({
            features: ['text', 'label'],
            num_rows: 533
        })
        ```
        r   z$index should be in [0, num_shards-1]r   )r   r  r  r  )r   r   r   r   r   r  r  )rs   r  ru  r  r  r  r  divmodr  endr   s               rt   r	  zDataset.shard  s    t E&&&&J&&&&CDDD 	>d))z)Cd))j(C%K#eS//1E#+eckkq9CE3''GGis4yy*==G{{)$;/	  
 
 	
rv   tfrecordr   c                    	
 	 ddl n*# t          $ r t                              d           Y nw xY wfdfd	fd
dt          t
          t          t          t          j	        t          f         dd	f	
fd
fdfd} fd} j        dk    rt          d          |                    d          st          d          j        j                            |j        d          }j        j                            |          }t                              d|            |                    |           t                              d|            d dS )a  Writes the Arrow dataset to a TFRecord file.

        The dataset must already be in tensorflow format. The records will be written with
        keys from `dataset._format_columns`.

        Args:
            filename (`str`): The filename, including the `.tfrecord` extension, to write to.
            format (`str`, optional, default `"tfrecord"`): The type of output file. Currently this is a no-op, as
                TFRecords are the only option. This enables a more flexible function signature later.
        r   NzITensorflow needs to be installed to be able to return Tensorflow tensors.c                 l    j                             j                             |                     S )z2Returns a bytes_list from a list of string / byte.r   )
bytes_list)r  Feature	BytesListr  r   s    rt   _bytes_featurez&Dataset.export.<locals>._bytes_feature  /    8##rx/A/A/A/O/O#PPPrv   c                 l    j                             j                             |                     S )z3Returns a float_list from a list of float / double.r  )
float_list)r  r  	FloatListr  s    rt   _float_featurez&Dataset.export.<locals>._float_feature  r  rv   c                 l    j                             j                             |                     S )z>Returns an int64_list from a list of bool / enum / int / uint.r  )
int64_list)r  r  	Int64Listr  s    rt   _int64_featurez&Dataset.export.<locals>._int64_feature  r  rv   r  rz   ztf.train.Featurec                    t          | t                    rE| r0t          | d         t                    r d | D                       S t          d|  d          t          | t          j                  r| j        t	          j        t                    k    r |           S | j        t          j        k    r |           S | j        t	          j        t                    k    sP| j        t	          j        t                    k    rCt          |           dk    r0t          | d         t                    r d | D                       S t          d|  d| d         j         d          t          | d          rt	          j        | j        t          j                  r |                                 g          S t	          j        | j        t          j                  r |                                 g          S t	          j        | j        t                    r0 |                                                                 g          S t          d|  d	| j         d          t          d|  d
          )zCTypechecks `values` and returns the corresponding tf.train.Feature.r   c                 6    g | ]}|                                 S r   encoder   r  s     rt   r   z4Dataset.export.<locals>._feature.<locals>.<listcomp>       *F*F*F!188::*F*F*Frv   zvalues=z5 is empty or contains items that cannot be serializedc                 6    g | ]}|                                 S r   r*  r,  s     rt   r   z4Dataset.export.<locals>._feature.<locals>.<listcomp>  r-  rv   z2 is empty or is an np.ndarray with items of dtype z, which cannot be serializedr   z has dtype z> are not numpy objects or strings, and so cannot be serialized)r   r   r   r   r   r   r   r  r   objectr   r  r   floatingitemr   r+  )r  r  r#  r'  s    rt   _featurez Dataset.export.<locals>._feature  sq   &$'' s njC88 n)>*F*Fv*F*F*FGGG$%lv%l%l%lmmmFBJ// s<28E??22)>&111\RX--)>&111\RXc]]22LBHV$4$444VqZX^_`XacfMgMg)>*F*Fv*F*F*FGGG$ J&  J  J\bcd\e\k  J  J  J   )) 
s=r{;; n)>6;;==/:::]6<<< n)>6;;==/:::]6<55 n)>6;;==+?+?+A+A*BCCC$%lv%l%l&,%l%l%lmmm !q6!q!q!qrrrrv   c                     fd|                                  D             }j                            j                            |                    }|                                S )Nc                 .    i | ]\  }}| |          S r   r   )r   r   r   r2  s      rt   r   z=Dataset.export.<locals>.serialize_example.<locals>.<dictcomp>0  s'    III
UsHHUOOIIIrv   )r  rZ  )r   r  Exampler(   SerializeToString)exr  example_protor2  r   s      rt   serialize_examplez)Dataset.export.<locals>.serialize_example/  sb    IIIIbhhjjIIIGH,,bh6G6GPW6G6X6X,YYM 22444rv   c                 j                         | fj                  }                    |d          S )Nr   )py_functionr   reshape)r7  	tf_stringr9  r   s     rt   tf_serialize_examplez,Dataset.export.<locals>.tf_serialize_example4  s1    '82%KKI::i,,,rv   c               3   0   K   D ]}  |           V  d S rp   r   )r7  rs   r9  s    rt   r  z!Dataset.export.<locals>.generator8  s:       , ,''++++++, ,rv   r   z-Dataset format must be numpy before exportingz	.tfrecordz+filename {filename} must end with .tfrecordr   )output_typesoutput_shapeszWriting TFRecord to zFinished writing TFRecord to )r   r   r  r  r   r  r   r   r   r   r   r5  r   r  r#  r   r  r   r$  TFRecordWriterrm   r  )rs   r  r   r>  r  r(  rN  r  r2  r#  r'  r9  r   s   `      @@@@@@rt   exportzDataset.export  s-   	f##### 	f 	f 	fLLdeeeee	f	Q 	Q 	Q 	Q 	Q	Q 	Q 	Q 	Q 	Q	Q 	Q 	Q 	Q 	Q	sU5#sBJ#DE 	sJ\ 	s 	s 	s 	s 	s 	s 	s 	s@	5 	5 	5 	5 	5 	5
	- 	- 	- 	- 	- 	-	, 	, 	, 	, 	, 	, ''LMMM  -- 	LJKKKW_33IBIeg3hh
%44X>>5855666Z   >H>>???s    $55path_or_bufc                 L    ddl m}  || |f||d|                                S )aq  Exports the dataset to csv

        Args:
            path_or_buf (`PathLike` or `FileOrBuffer`):
                Either a path to a file or a BinaryIO.
            batch_size (`int`, *optional*):
                Size of the batch to load in memory and write at once.
                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
            num_proc (`int`, *optional*):
                Number of processes for multiprocessing. By default it doesn't
                use multiprocessing. `batch_size` in this case defaults to
                `datasets.config.DEFAULT_MAX_BATCH_SIZE` but feel free to make it 5x or 10x of the default
                value if you have sufficient compute power.
            **to_csv_kwargs (additional keyword arguments):
                Parameters to pass to pandas's [`pandas.DataFrame.to_csv`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html).

                <Changed version="2.10.0">

                Now, `index` defaults to `False` if not specified.

                If you would like to write the index, pass `index=True` and also set a name for the index column by
                passing `index_label`.

                </Changed>

        Returns:
            `int`: The number of characters or bytes written.

        Example:

        ```py
        >>> ds.to_csv("path/to/dataset/directory")
        ```
        r   )CsvDatasetWriterr   r  )r  rF  r  )rs   rD  r   r  to_csv_kwargsrF  s         rt   to_csvzDataset.to_csvG  sG    T 	-,,,,,kmjS[mm_lmmssuuurv   c           	      j    |dk    rt          j        dt                     nd}|sRt           j        t          dt                                j         j        nd                                          S rnt          j
         fdt          dt                               D             S )a0  Returns the dataset as a Python dict. Can also return a generator for large datasets.

        Args:
            batched (`bool`):
                Set to `True` to return a generator that yields the dataset as batches
                of `batch_size` rows. Defaults to `False` (returns the whole datasets once).

                <Deprecated version="2.11.0">

                Use `.iter(batch_size=batch_size)` followed by `.to_dict()` on the individual batches instead.

                </Deprecated>

            batch_size (`int`, *optional*): The size (number of rows) of the batches if `batched` is `True`.
                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.

        Returns:
            `dict` or `Iterator[dict]`

        Example:

        ```py
        >>> ds.to_dict()
        ```
        r  z'batched' was deprecated in version 2.11.0 and will be removed in version 3.0.0. Use `.iter(batch_size=batch_size)` followed by `.to_dict()` on the individual batches instead.Fr   Nr  c           	   3      K   | ]L}t          j        t          ||z             j        j        nd                                           V  Md S )Nr  )rA   r  r  r  	to_pydictr   rX  r   rs   s     rt   rP  z"Dataset.to_dict.<locals>.<genexpr>  sz          *ffz&9::-1]-FDMMD   )++     rv   )r  r  r  rA   r  r  r   r  rL  r    rF  r   rs   r   r  s   `` rt   to_dictzDataset.to_dictu  s    4 l""M B   
 G 	j!SYY'')-)B   ikk	 (2Tv7TJ     $As4yy*==   rv   c           	          t          | j        t          dt          |                     | j        | j        nd                                          S )zReturns the dataset as a Python list.

        Returns:
            `list`

        Example:

        ```py
        >>> ds.to_list()
        ```
        r   Nr  )rA   r  r  r   r  r  rx   s    rt   to_listzDataset.to_list  sN     *aT##%)]%>DMMD
 
 
 )++		rv   c                 L    ddl m}  || |f||d|                                S )aT  Export the dataset to JSON Lines or JSON.

        Args:
            path_or_buf (`PathLike` or `FileOrBuffer`):
                Either a path to a file or a BinaryIO.
            batch_size (`int`, *optional*):
                Size of the batch to load in memory and write at once.
                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
            num_proc (`int`, *optional*):
                Number of processes for multiprocessing. By default it doesn't
                use multiprocessing. `batch_size` in this case defaults to
                `datasets.config.DEFAULT_MAX_BATCH_SIZE` but feel free to make it 5x or 10x of the default
                value if you have sufficient compute power.
            **to_json_kwargs (additional keyword arguments):
                Parameters to pass to pandas's [`pandas.DataFrame.to_json`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html).

                <Changed version="2.11.0">

                Now, `index` defaults to `False` if `orint` is  `"split"` or `"table"` is  specified.

                If you would like to write the index, pass `index=True`.

                </Changed>

        Returns:
            `int`: The number of characters or bytes written.

        Example:

        ```py
        >>> ds.to_json("path/to/dataset/directory")
        ```
        r   )JsonDatasetWriterrG  )r  rS  r  )rs   rD  r   r  to_json_kwargsrS  s         rt   to_jsonzDataset.to_json  sG    R 	/.....  {ozT\oo`noouuwwwrv   c           	      2    |sYt           j        t          dt                                j         j        nd                              t                    S rnt          j         fdt          dt                               D             S )a  Returns the dataset as a `pandas.DataFrame`. Can also return a generator for large datasets.

        Args:
            batched (`bool`):
                Set to `True` to return a generator that yields the dataset as batches
                of `batch_size` rows. Defaults to `False` (returns the whole datasets once).
            batch_size (`int`, *optional*):
                The size (number of rows) of the batches if `batched` is `True`.
                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.

        Returns:
            `pandas.DataFrame` or `Iterator[pandas.DataFrame]`

        Example:

        ```py
        >>> ds.to_pandas()
        ```
        r   Nr  types_mapperc           	   3      K   | ]S}t          j        t          ||z             j        j        nd                               t
                    V  Td S )Nr  rW  )rA   r  r  r  	to_pandasr/   rM  s     rt   rP  z$Dataset.to_pandas.<locals>.<genexpr>  s          *ffz&9::-1]-FDMMD   ))<)==     rv   )
rA   r  r  r   r  rZ  r/   r    rF  r   rN  s   `` rt   rZ  zDataset.to_pandas  s    ,  	j!SYY'')-)B   i%8i99	: (2Tv7TJ     $As4yy*==   rv   c                 J    ddl m}  || |fd|i|                                S )a  Exports the dataset to parquet

        Args:
            path_or_buf (`PathLike` or `FileOrBuffer`):
                Either a path to a file or a BinaryIO.
            batch_size (`int`, *optional*):
                Size of the batch to load in memory and write at once.
                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
            **parquet_writer_kwargs (additional keyword arguments):
                Parameters to pass to PyArrow's `pyarrow.parquet.ParquetWriter`.

        Returns:
            `int`: The number of characters or bytes written.

        Example:

        ```py
        >>> ds.to_parquet("path/to/dataset/directory")
        ```
        r   )ParquetDatasetWriterr   )r  r\  r  )rs   rD  r   parquet_writer_kwargsr\  s        rt   
to_parquetzDataset.to_parquet  sC    6 	544444##D+ff*fPeffllnnnrv   r  c                 L    ddl m}  || ||fd|i|                                S )a  Exports the dataset to a SQL database.

        Args:
            name (`str`):
                Name of SQL table.
            con (`str` or `sqlite3.Connection` or `sqlalchemy.engine.Connection` or `sqlalchemy.engine.Connection`):
                A [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) or a SQLite3/SQLAlchemy connection object used to write to a database.
            batch_size (`int`, *optional*):
                Size of the batch to load in memory and write at once.
                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
            **sql_writer_kwargs (additional keyword arguments):
                Parameters to pass to pandas's [`pandas.DataFrame.to_sql`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html).

                <Changed version="2.11.0">

                Now, `index` defaults to `False` if not specified.

                If you would like to write the index, pass `index=True` and also set a name for the index column by
                passing `index_label`.

                </Changed>

        Returns:
            `int`: The number of records written.

        Example:

        ```py
        >>> # con provided as a connection URI string
        >>> ds.to_sql("data", "sqlite:///my_own_db.sql")
        >>> # con provided as a sqlite3 connection object
        >>> import sqlite3
        >>> con = sqlite3.connect("my_own_db.sql")
        >>> with con:
        ...     ds.to_sql("data", con)
        ```
        r   )SqlDatasetWriterr   )r  r`  r  )rs   r  r  r   sql_writer_kwargsr`  s         rt   to_sqlzDataset.to_sql-  sF    Z 	-,,,,,dC\\J\J[\\bbdddrv   c                    | j         j        }d | j        j                                        D             }|r`dfd}|                     d          d d         }t          ||           t          | j                   z  t          |          z  |z   }| j        ,|t          | j                  z  t          | j                   z  }|S )Nc                 :    g | ]\  }}t          |d           |S T)ignore_decode_attributer0   r   rO  r  s      rt   r   z,Dataset._estimate_nbytes.<locals>.<listcomp>c  s@     
 
 
!Q9I!ei9j9j9j

 
 
rv   r   c                    t          |t          t          f          rb|                                 D ].}|*|d         "|d         t	          |d                   }|z  /|                     d          j        z  d S d S )Nbytesr  )r   r&   r)   r  r%   r  nbytes)r   r  r7  rv  extra_nbytess       rt   extra_nbytes_visitorz6Dataset._estimate_nbytes.<locals>.extra_nbytes_visitork  s    gu~66 ?"__.. 1 1=QwZ-?AfIDY#+AfI#6#6D(D0L EKK$7$7$>>LLL? ?rv   rE  r  )	r#  rk  rq   r   r   r!  rU   r   r  )rs   r2  decodable_columnsrm  rU  rl  s        @rt   r  zDataset._estimate_nbytes^  s    )
 
*-3355
 
 
  	;L? ? ? ? ? $$W--ete4E%!5666'#di..83u::EL+l:N=$+c$-.@.@@3ty>>QNrv   r[  c              #     K   t                      }t          |           D ]\  }}d}|                    d                              t          j                  D ]f}|                    |          }t          t          |                    D ]2fd|	                                D             }| d| |fV  |dz  }3gd S )Nr   rE  c                 (    i | ]\  }}||         S r   r   r  s      rt   r   z:Dataset._generate_examples_from_shards.<locals>.<dictcomp>  s#    MMMesE!HMMMrv   r   r   )
r=   r  r!  rH  r    r  format_batchr   r   r   )	r[  python_formatter
shards_idxr	  example_idxr  r   r  r   s	           @rt   _generate_examples_from_shardsz&Dataset._generate_examples_from_shards~  s      *,,!*6!2!2 	% 	%JK!--g66;;F<jkk % %(55h??s8}}-- % %AMMMMu{{}}MMMG'77+77@@@@1$KK%%	% 	%rv   r   rj   c                     ddl m}m}  j        t	          d          t                     k    r#t          dt                      d d           j        t          	                    d           dk    rt          j                   gn fd	t                    D             } |t          j        d
|i          } ||t           j                            S )a  Get an [`datasets.IterableDataset`] from a map-style [`datasets.Dataset`].
        This is equivalent to loading a dataset in streaming mode with [`datasets.load_dataset`], but much faster since the data is streamed from local files.

        Contrary to map-style datasets, iterable datasets are lazy and can only be iterated over (e.g. using a for loop).
        Since they are read sequentially in training loops, iterable datasets are much faster than map-style datasets.
        All the transformations applied to iterable datasets like filtering or processing are done on-the-fly when you start iterating over the dataset.

        Still, it is possible to shuffle an iterable dataset using [`datasets.IterableDataset.shuffle`].
        This is a fast approximate shuffling that works best if you have multiple shards and if you specify a buffer size that is big enough.

        To get the best speed performance, make sure your dataset doesn't have an indices mapping.
        If this is the case, the data are not read contiguously, which can be slow sometimes.
        You can use `ds = ds.flatten_indices()` to write your dataset in contiguous chunks of data and have optimal speed before switching to an iterable dataset.

        Args:
            num_shards (`int`, default to `1`):
                Number of shards to define when instantiating the iterable dataset. This is especially useful for big datasets to be able to shuffle properly,
                and also to enable fast parallel loading using a PyTorch DataLoader or in distributed setups for example.
                Shards are defined using [`datasets.Dataset.shard`]: it simply slices the data without writing anything on disk.

        Returns:
            [`datasets.IterableDataset`]

        Example:

        Basic usage:
        ```python
        >>> ids = ds.to_iterable_dataset()
        >>> for example in ids:
        ...     pass
        ```

        With lazy filtering and processing:
        ```python
        >>> ids = ds.to_iterable_dataset()
        >>> ids = ids.filter(filter_fn).map(process_fn)  # will filter and process on-the-fly when you start iterating over the iterable dataset
        >>> for example in ids:
        ...     pass
        ```

        With sharding to enable efficient shuffling:
        ```python
        >>> ids = ds.to_iterable_dataset(num_shards=64)  # the dataset is split into 64 shards to be iterated over
        >>> ids = ids.shuffle(buffer_size=10_000)  # will shuffle the shards order and use a shuffle buffer for fast approximate shuffling when you start iterating
        >>> for example in ids:
        ...     pass
        ```

        With a PyTorch DataLoader:
        ```python
        >>> import torch
        >>> ids = ds.to_iterable_dataset(num_shards=64)
        >>> ids = ids.filter(filter_fn).map(process_fn)
        >>> dataloader = torch.utils.data.DataLoader(ids, num_workers=4)  # will assign 64 / 4 = 16 shards to each worker to load, filter and process when you start iterating
        >>> for example in ids:
        ...     pass
        ```

        With a PyTorch DataLoader and shuffling:
        ```python
        >>> import torch
        >>> ids = ds.to_iterable_dataset(num_shards=64)
        >>> ids = ids.shuffle(buffer_size=10_000)  # will shuffle the shards order and use a shuffle buffer when you start iterating
        >>> dataloader = torch.utils.data.DataLoader(ids, num_workers=4)  # will assign 64 / 4 = 16 shards from the shuffled list of shards to each worker when you start iterating
        >>> for example in ids:
        ...     pass
        ```

        In a distributed setup like PyTorch DDP with a PyTorch DataLoader and shuffling
        ```python
        >>> from datasets.distributed import split_dataset_by_node
        >>> ids = ds.to_iterable_dataset(num_shards=512)
        >>> ids = ids.shuffle(buffer_size=10_000)  # will shuffle the shards order and use a shuffle buffer when you start iterating
        >>> ids = split_dataset_by_node(ds, world_size=8, rank=0)  # will keep only 512 / 8 = 64 shards from the shuffled lists of shards when you start iterating
        >>> dataloader = torch.utils.data.DataLoader(ids, num_workers=4)  # will assign 64 / 4 = 16 shards from this node's list of shards to each worker when you start iterating
        >>> for example in ids:
        ...     pass
        ```

        With shuffling and multiple epochs:
        ```python
        >>> ids = ds.to_iterable_dataset(num_shards=64)
        >>> ids = ids.shuffle(buffer_size=10_000, seed=42)  # will shuffle the shards order and use a shuffle buffer when you start iterating
        >>> for epoch in range(n_epochs):
        ...     ids.set_epoch(epoch)  # will use effective_seed = seed + epoch to shuffle the shards and for the shuffle buffer when you start iterating
        ...     for example in ids:
        ...         pass
        ```
        Feel free to also use [`IterableDataset.set_epoch`] when using a PyTorch DataLoader or in distributed setups.
        r   )ExamplesIterablerj   NzConverting a formatted dataset to a formatted iterable dataset is not implemented yet. Please run `my_dataset = my_dataset.with_format(None)` before calling to_iterable_datasetz"Unable to shard a dataset of size z into z= shards (the number of shards exceeds the number of samples).zConverting an Arrow dataset to iterable but it has an indices mapping that can make it slower. You can use `ds = ds.flatten_indices()` to write your dataset in contiguous chunks of data and have optimal speed.c                 @    g | ]}                     |d           S )Tr  r  )r   r  r  rs   s     rt   r   z/Dataset.to_iterable_dataset.<locals>.<listcomp>  s8       Xa

j	d
SS  rv   r[  )r<  rZ  rm   )iterable_datasetrw  rj   r5  NotImplementedErrorr   r   r  r  rm   r   r   r   r   ru  rD   r   )rs   r  rw  rj   r[  ex_iterables   ``    rt   to_iterable_datasetzDataset.to_iterable_dataset  sV   v 	HGGGGGGG(% C   D		!! PSYY  P  Pj  P  P  P   =$KKE   Q ]4  !!    ejkuevev   	 '&w'MW_agVhiii{dm1T1T1TUUUUrv   repo_idprivatetokenbranchembed_external_filesc	                 B    ! |t          d          t          t          j                   nt	          j                    t          d           j        t           j                  ndt          j
        t                    st          dt           d d                              d	          }	t          |	          d
k    rt          d|	 d          t          |	          dk    r*|	d         }
                               d         }| d	|
                      d|d           |r(d  j        j                                        D             ng }                                 }@t'          |pt          j                  }t+          ||z            dz   t-          d           fdt/                    D             }|rd } ||          }                     d          }d |D             }fd}t3          |          }t5          |          } |d|          }||v r-t          |          k     rt6                              d           d}g !t;          j        t?          tA          j!        |g|                    dt;          j"                               D ]\  }} |||          }||vrttG                      }|$                    |           ||%                                z  }tM           j'        |(                                |ddtR          dgddd !           !*                    |           !fd"|D             }tW          fd#|D                       } fd$}t          |          rEt;          j        |d%t          |          t;          j"                               D ]} ||           tY          t[          |          t[          |          z
            }||||fS )&a
  Pushes the dataset to the hub.
        The dataset is pushed using HTTP requests and does not need to have neither git or git-lfs installed.

        Args:
            repo_id (`str`):
                The ID of the repository to push to in the following format: `<user>/<dataset_name>` or
                `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace
                of the logged-in user.
            split (Optional, `str`):
                The name of the split that will be given to that dataset. Defaults to `self.split`.
            private (Optional `bool`, defaults to `False`):
                Whether the dataset repository should be set to private or not. Only affects repository creation:
                a repository that already exists will not be affected by that parameter.
            token (Optional `str`):
                An optional authentication token for the Hugging Face Hub. If no token is passed, will default
                to the token saved locally when logging in with ``huggingface-cli login``. Will raise an error
                if no token is passed and the user is not logged-in.
            branch (Optional `str`):
                The git branch on which to push the dataset. This defaults to the default branch as specified
                in your repository, which defaults to `"main"`.
            max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
                The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a
                a unit (like `"5MB"`).
            num_shards (`int`, *optional*):
                Number of shards to write. By default the number of shards depends on `max_shard_size`.

                <Added version="2.8.0"/>
            embed_external_files (`bool`, default ``True``):
                Whether to embed file bytes in the shards.
                In particular, this will do the following before the push for the fields of type:

                - :class:`Audio` and class:`Image`: remove local path information and embed file content in the Parquet files.

        Returns:
            repo_id (`str`): ID of the repository in <user>/<dataset_name>` or `<org>/<dataset_name>` format
            split (`str`): name of the uploaded split
            uploaded_size (`int`): number of uploaded bytes to the repository
            dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset afer uncompression
            repo_files (`str`): list of files in the repository
            deleted_size (`int`): number of deleted bytes in the repository

        Example:

        ```python
        >>> dataset.push_to_hub("<organization>/<dataset_id>", split="evaluation")
        ```
        Nr  endpointz[You need to provide a `token` or be logged in to Hugging Face with `huggingface-cli login`.r  zSplit name should match 'z' but got 'z'.r  r  zQThe identifier should be in the format <repo_id> or <namespace>/<repo_id>. It is z), which doesn't conform to either format.r   r   r  r   T)r  	repo_typer  r  c                 :    g | ]\  }}t          |d           |S re  rg  rh  s      rt   r   z7Dataset._push_parquet_shards_to_hub.<locals>.<listcomp>b  s0    lll41a:J1fj:k:k:klQlllrv   c              3   H   K   | ]}                     |d           V  dS )Tr  Nr  )r   r   r  rs   s     rt   rP  z6Dataset._push_parquet_shards_to_hub.<locals>.<genexpr>n  s6      iiRS$**
!*MMiiiiiirv   c              3      K   | D ]M}|j         }|                    d          }|                    t          ddd          } |j        di |}|V  Nd S )NrE  Tr  )r  r   r  r   )r   r!  r"  rQ   )r[  r	  r   s      rt   #shards_with_embedded_external_fileszPDataset._push_parquet_shards_to_hub.<locals>.shards_with_embedded_external_filesr  s      # 
  
 E"\F!--g66E!II+ $#''+	 &  E .E-7777EKKKK
  
 rv   )r  revisionr  c                 <    g | ]}|                     d           |S )data/r  )r   rZ  s     rt   r   z7Dataset._push_parquet_shards_to_hub.<locals>.<listcomp>  s)    IIIt0H0HIdIIIrv   c           	      0    d d| dddd|j          d	S )Nr  -r  r  z.parquetr_  )_indexr	  r  rn   s     rt   path_in_repoz9Dataset._push_parquet_shards_to_hub.<locals>.path_in_repo  s5    `5``6```:```EDV````rv   z&Resuming upload of the dataset shards.z)Pushing dataset shards to the dataset hub)r  r  r   path_or_fileobjr  r~  r  r  r  i  g       @   g      4@)func_kwargs
exceptionsstatus_codesbase_wait_timemax_retriesmax_wait_timec                 N    g | ]!}|                     d  d          r|v|"S )r  r  r  )r   r^  shards_path_in_reporn   s     rt   r   z7Dataset._push_parquet_shards_to_hub.<locals>.<listcomp>  sT      
  
  
##$4E$4$4$455 
 ;DK^:^:^ :^:^:^rv   c              3   V   K   | ]#}t          t          |                     V  $dS ))use_auth_tokenN)r%   r[   )r   r^  r~  r  s     rt   rP  z6Dataset._push_parquet_shards_to_hub.<locals>.<genexpr>  sK       
 
OXHZ33EJJJ
 
 
 
 
 
rv   c                 <                         | d           d S )Nr   )r~  r  r  r  )delete_file)rZ  apir  r~  r  s    rt   r  z8Dataset._push_parquet_shards_to_hub.<locals>.delete_file  s&    OOD')^dOeeeeerv   z-Deleting unused files from dataset repository).r   r   r    HF_ENDPOINTr   	get_tokenr  rn   r   rematchrF   r   whoamicreate_reporq   r   r   r  r_   r  r   r  r   list_repo_filesrH  r  r  r  rW   r'  r  r  chainr(  r   r^  tellrX   upload_filer  r   r   rY  r   r   )"rs   r~  rn   r  r  r  r  r  r  
identifierdataset_nameorganization_or_usernamern  r2  r[  r  r"  
data_filesr  shards_iterfirst_shardfirst_shard_path_in_repouploaded_sizeru  r	  shard_path_in_repor  data_files_to_deletedeleted_sizer  r^  
repo_filesr  r  s"   ``` `` `                        @@rt   _push_parquet_shards_to_hubz#Dataset._push_parquet_shards_to_hub  s^   t %**@j   V/000*0B0D0D="m   ='+z'=C
OOO7Ex	5)) 	ZXXXuXXXYYY]]3''
z??Q:dn : : :   __!!%a=L'*zz%'8'8'@$1BBLBBG 	 	
 	
 	
 $ll4:.4466llll 	 ..005n6]H]^^N^n<==AJZ++JiiiiiW\]gWhWhiii 	A      98@@F##Gy6Y^#__IIuIII
	a 	a 	a 	a 	a 	a 6ll;''#/<;#?#? #z11j3z??6R6RNNCDDD #Lio{m[AABB<7999	
 
 
 	; 	;LE5 ".eU!;!;!33   (((.O+1??+<+<(:#*!&%.$*! !  )"%#& !"&      &&'9:::: 
  
  
  
  
' 
  
  

  
 
 
 
 
\p
 
 
 
 
	f 	f 	f 	f 	f 	f 	f 	f #$$ 	'$\$D.//#;===	   ' '	 I&&&&#e**s+?'@'@@AA
}nj,VVrv   c	           
      	   ||t          d          |                     ||||||||          \  }}}	}
}}|                    d          \  }}| j                                        }d|_        |	|_        |
|_        |	|
z   |_        t          |t          ||
t          |           |          i          |_        d|v rt                      }d|_        ||_        t!          t#          |d          |          }t%          j        t)          |                    }t+          j        |          }|r#|t/          t1          |                             }nd}nt2          j        |v rt%                      }t                      }d|_        ||_        t!          t#          |t2          j                  |          }t7          |d	
          5 }t9          j        |          }|r5t=          j        |t/          t1          |                                       }nd}ddd           n# 1 swxY w Y   nt%                      }d}|0t@          !                    d           |j        rtE          |j                  |gk    r| j#        j$        |j$        k    r$t          d| j#        j$         d|j$                   ||j        v rL|xj        |z  c_        |xj        |j        %                    |t                                j&        pdz  c_        d|_        |j        pd|	z   |_        |j        pd|
z   |_        |j        |j        z   |_        t          ||
t          |           |          |j        |<   |}t2          j        |v rtO                      }|(                    d           |)                    |d           |(                    d           tU          t2          j+                  ,                    |-                                t2          j        ||d|           t+          d|i          .                    |           d|v r>t7          |d	
          5 }|/                                }ddd           n# 1 swxY w Y   nd|                    d          d          d}tU          t2          j+                  ,                    |0                    |          1                                d||d|           dS )a  Pushes the dataset to the hub as a Parquet dataset.
        The dataset is pushed using HTTP requests and does not need to have neither git or git-lfs installed.

        The resulting Parquet files are self-contained by default. If your dataset contains [`Image`] or [`Audio`]
        data, the Parquet files will store the bytes of your images or audio files.
        You can disable this by setting `embed_external_files` to `False`.

        Args:
            repo_id (`str`):
                The ID of the repository to push to in the following format: `<user>/<dataset_name>` or
                `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace
                of the logged-in user.
            split (`str`, *optional*):
                The name of the split that will be given to that dataset. Defaults to `self.split`.
            private (`bool`, *optional*, defaults to `False`):
                Whether the dataset repository should be set to private or not. Only affects repository creation:
                a repository that already exists will not be affected by that parameter.
            token (`str`, *optional*):
                An optional authentication token for the Hugging Face Hub. If no token is passed, will default
                to the token saved locally when logging in with `huggingface-cli login`. Will raise an error
                if no token is passed and the user is not logged-in.
            branch (`str`, *optional*):
                The git branch on which to push the dataset. This defaults to the default branch as specified
                in your repository, which defaults to `"main"`.
            max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
                The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by
                a unit (like `"5MB"`).
            num_shards (`int`, *optional*): Number of shards to write. By default the number of shards depends on `max_shard_size`.

                <Added version="2.8.0"/>
            embed_external_files (`bool`, defaults to `True`):
                Whether to embed file bytes in the shards.
                In particular, this will do the following before the push for the fields of type:

                - [`Audio`] and [`Image`]: remove local path information and embed file content in the Parquet files.

        Example:

        ```python
        >>> dataset.push_to_hub("<organization>/<dataset_id>")
        >>> dataset.push_to_hub("<organization>/<dataset_id>", split="validation")
        >>> dataset.push_to_hub("<organization>/<dataset_id>", max_shard_size="1GB")
        >>> dataset.push_to_hub("<organization>/<dataset_id>", num_shards=1024)
        ```
        Nr  )r~  rn   r  r  r  r  r  r  r  )rQ  rP  r  z	README.mdzDownloading metadata)download_configr  r  z0Updating downloaded metadata with the new split.zVFeatures of the new split don't match the features of the existing splits on the hub: z != r   s   {"default": T)pretty_print   }r  r   r  defaultz# Dataset Card for "z"

[More Information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards))2r   r  rn   rm   r   r   r   r   r   rJ   rK   r   splitsr$   download_descr  rY   r[   r]   from_readmer   rE   from_metadatar  rH  r    DATASETDICT_INFOS_FILENAMEr-  r_  rh  rD   r  r  r  r   rq   r   rN  rQ  r   r  
_dump_infor   r  r  r  to_metadatar  
_to_readmer+  )rs   r~  rn   r  r  r  r  r  r  r  r2  r  r  organizationr  info_to_dumpr  dataset_readme_pathdataset_metadatadataset_infos	repo_infodataset_infos_pathrP  r  readme_filereadme_contents                             rt   push_to_hubzDataset.push_to_hub  s   p %**@j   SWRrRr)!!5 Ss 	S
 	S
O~z< &-]]3%7%7"ly~~''*.'%2"$2!%2^%C"'Ie~CPTIIdpqqqr
 
 *$$,..O,BO)-2O*"-7K00 /# # #  /:4@S;T;TUU.>.LM].^.^M !)$tM/B/B*C*CD		 		.*<<.00,..O,BO)-2O*!,7F$EFF /" " " (7;;; %q26)A,,  % + 5mDmI\I\D]D]6^ _ _II $I% % % % % % % % % % % % % % %  /00I NNMNNN )D)9$:$:ug$E$E:&)*<<<$ _quq{  rE  _  _  KT  K]  _  _   I,,,++|;++**i.>.B.B5)++.V.V.`.edee**/3	,+4+B+Ga=*X	'*3*@*EA)W	&*3*AIDZ*Z	'*3^#d))Zf+ + +	 '  ),
::YYFLL)))##F#>>>LL6-...:: & 1 1#># ;    	)\233??@PQQQ*$$)G<<< 4!,!1!1!3!34 4 4 4 4 4 4 4 4 4 4 4 4 4 4 UGMM#4F4Fr4J  U  U  UNv)***66,77GGNNPP$ 	7 	
 	
 	
 	
 	
s%   
AH$$H(+H(Q99Q= Q=c                    t          j        ||i          }t          | j        j        |j        z              | j        |                                 n| }t          |j        |gd          }|j        	                                }|j
                            t          j        |j                             t          ||j
                  }t!          ||| j        d|          S )al  Add column to Dataset.

        <Added version="1.7"/>

        Args:
            name (`str`):
                Column name.
            column (`list` or `np.array`):
                Column data to be added.

        Returns:
            [`Dataset`]

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> more_text = ds["text"]
        >>> ds.add_column(name="text_2", column=more_text)
        Dataset({
            features: ['text', 'label', 'text_2'],
            num_rows: 1066
        })
        ```
        Nr   axisr  )rL   r  rr  r  r3  r  r  rP   rm   r   r   r,  r(   r  r\  re  r   rn   )rs   r  r   r  column_tabler   rU  rm   s           rt   
add_columnzDataset.add_columnl  s    : %0$@@DJ3l6OOPPP,0M,E$&&(((4w}l;!DDD|  ""X78KLLMMM-eT]CCu4tz[jkkkkrv   
index_namedevicestring_factorymetric_typecustom_indexzfaiss.Indexfaiss_verbosec                     |                      d|g|
          5  t                                          |||||||||		  	         ddd           n# 1 swxY w Y   | S )a  Add a dense index using Faiss for fast retrieval.
        By default the index is done over the vectors of the specified column.
        You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
        You can find more information about Faiss here:

        - For [string factory](https://github.com/facebookresearch/faiss/wiki/The-index-factory)

        Args:
            column (`str`):
                The column of the vectors to add to the index.
            index_name (`str`, *optional*):
                The `index_name`/identifier of the index.
                This is the `index_name` that is used to call [`~datasets.Dataset.get_nearest_examples`] or [`~datasets.Dataset.search`].
                By default it corresponds to `column`.
            device (`Union[int, List[int]]`, *optional*):
                If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
            string_factory (`str`, *optional*):
                This is passed to the index factory of Faiss to create the index.
                Default index class is `IndexFlat`.
            metric_type (`int`, *optional*):
                Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
            custom_index (`faiss.Index`, *optional*):
                Custom Faiss index that you already have instantiated and configured for your needs.
            batch_size (`int`):
                Size of the batch to use while adding vectors to the `FaissIndex`. Default value is `1000`.
                <Added version="2.4.0"/>
            train_size (`int`, *optional*):
                If the index needs a training step, specifies how many vectors will be used to train the index.
            faiss_verbose (`bool`, defaults to `False`):
                Enable the verbosity of the Faiss index.
            dtype (`data-type`):
                The dtype of the numpy arrays that are indexed.
                Default is `np.float32`.

        Example:

        ```python
        >>> ds = datasets.load_dataset('crime_and_punish', split='train')
        >>> ds_with_embeddings = ds.map(lambda example: {'embeddings': embed(example['line']}))
        >>> ds_with_embeddings.add_faiss_index(column='embeddings')
        >>> # query
        >>> scores, retrieved_examples = ds_with_embeddings.get_nearest_examples('embeddings', embed('my new query'), k=10)
        >>> # save index
        >>> ds_with_embeddings.save_faiss_index('embeddings', 'my_index.faiss')

        >>> ds = datasets.load_dataset('crime_and_punish', split='train')
        >>> # load index
        >>> ds.load_faiss_index('embeddings', 'my_index.faiss')
        >>> # query
        >>> scores, retrieved_examples = ds.get_nearest_examples('embeddings', embed('my new query'), k=10)
        ```
        r   )r   r   r   )	r   r  r  r  r  r  r   r   r  N)r  r  add_faiss_index)rs   r   r  r  r  r  r  r   r   r  r   r  s              rt   r  zDataset.add_faiss_index  s    D GfXUKK 	 	GG##%-')%%+ $ 
 
 
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 s   +AAAexternal_arraysc                     t                                          |                    |
          ||||||||		  	         dS )a  Add a dense index using Faiss for fast retrieval.
        The index is created using the vectors of `external_arrays`.
        You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
        You can find more information about Faiss here:

        - For [string factory](https://github.com/facebookresearch/faiss/wiki/The-index-factory)

        Args:
            external_arrays (`np.array`):
                If you want to use arrays from outside the lib for the index, you can set `external_arrays`.
                It will use `external_arrays` to create the Faiss index instead of the arrays in the given `column`.
            index_name (`str`):
                The `index_name`/identifier of the index.
                This is the `index_name` that is used to call [`~datasets.Dataset.get_nearest_examples`] or [`~datasets.Dataset.search`].
            device (Optional `Union[int, List[int]]`, *optional*):
                If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
            string_factory (`str`, *optional*):
                This is passed to the index factory of Faiss to create the index.
                Default index class is `IndexFlat`.
            metric_type (`int`, *optional*):
                Type of metric. Ex: `faiss.faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
            custom_index (`faiss.Index`, *optional*):
                Custom Faiss index that you already have instantiated and configured for your needs.
            batch_size (`int`, *optional*):
                Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.
                <Added version="2.4.0"/>
            train_size (`int`, *optional*):
                If the index needs a training step, specifies how many vectors will be used to train the index.
            faiss_verbose (`bool`, defaults to False):
                Enable the verbosity of the Faiss index.
            dtype (`numpy.dtype`):
                The dtype of the numpy arrays that are indexed. Default is np.float32.
        )	r  r  r  r  r  r  r   r   r  N)r  $add_faiss_index_from_external_arraysr  )rs   r  r  r  r  r  r  r   r   r  r   r  s              rt   r  z,Dataset.add_faiss_index_from_external_arrays  sZ    ^ 	44+22599!)#%!!' 	5 
	
 
	
 
	
 
	
 
	
rv   hostport	es_clientzelasticsearch.Elasticsearches_index_namees_index_configc           
          |                      d|g          5  t                                          |||||||           ddd           n# 1 swxY w Y   | S )a  Add a text index using ElasticSearch for fast retrieval. This is done in-place.

        Args:
            column (`str`):
                The column of the documents to add to the index.
            index_name (`str`, *optional*):
                The `index_name`/identifier of the index.
                This is the index name that is used to call [`~Dataset.get_nearest_examples`] or [`Dataset.search`].
                By default it corresponds to `column`.
            host (`str`, *optional*, defaults to `localhost`):
                Host of where ElasticSearch is running.
            port (`str`, *optional*, defaults to `9200`):
                Port of where ElasticSearch is running.
            es_client (`elasticsearch.Elasticsearch`, *optional*):
                The elasticsearch client used to create the index if host and port are `None`.
            es_index_name (`str`, *optional*):
                The elasticsearch index name used to create the index.
            es_index_config (`dict`, *optional*):
                The configuration of the elasticsearch index.
                Default config is:
                    ```
                    {
                        "settings": {
                            "number_of_shards": 1,
                            "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
                        },
                        "mappings": {
                            "properties": {
                                "text": {
                                    "type": "text",
                                    "analyzer": "standard",
                                    "similarity": "BM25"
                                },
                            }
                        },
                    }
                    ```
        Example:

        ```python
        >>> es_client = elasticsearch.Elasticsearch()
        >>> ds = datasets.load_dataset('crime_and_punish', split='train')
        >>> ds.add_elasticsearch_index(column='line', es_client=es_client, es_index_name="my_es_index")
        >>> scores, retrieved_examples = ds.get_nearest_examples('line', 'my new query', k=10)
        ```
        N)r   r   )r   r  r  r  r  r  r  )r  r  add_elasticsearch_index)	rs   r   r  r  r  r  r  r  r  s	           rt   r  zDataset.add_elasticsearch_index  s    p D6(;; 		 		GG++%#+ / ,   		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 s   )AAAr1  c                 ,   t          j        d |                                D                       }t          | j        j        t          j        |j                  g          \  }}t          | j        j        |k    r| j
                            |j                  n| j
        |                    |j                  g          }| j        d}nht          j        t!          | j
                  gt          j                              }t          j        |gdg          }	t          | j        |	g          }| j                                        }
|
j                            |           t-          ||
j                  }t/          ||
| j        ||          S )aU  Add item to Dataset.

        <Added version="1.7"/>

        Args:
            item (`dict`):
                Item data to be added.

        Returns:
            [`Dataset`]

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> new_review = {'label': 0, 'text': 'this movie is the absolute worst thing I have ever seen'}
        >>> ds = ds.add_item(new_review)
        >>> ds[-1]
        {'label': 0, 'text': 'this movie is the absolute worst thing I have ever seen'}
        ```
        c                     i | ]	\  }}||g
S r   r   rh  s      rt   r   z$Dataset.add_item.<locals>.<dictcomp>|  s     /P/P/P41aA3/P/P/Prv   Nrx  r   r  r  )rL   r  r   r,   rq   r   r(   r  r\  rP   r  r  r  r  rh  r   r   r  r  rm   r   r,  re  r   rn   )rs   r1  r  
item_tabledset_featuresitem_featuresrU  r|  item_indices_arrayitem_indices_tablerm   s              rt   add_itemzDataset.add_itemc  sy   2 #./P/P4::<</P/P/PQQ
'6Z ("<Z=N"O"OP(
 (
$} ?Cz?RVc?c?c
 :;;;imis :;;
 
 =  MM!#3tz??*;")++!N!N!N!.!:<N;OXaWb!c!c!c)4=:L*MNNMy~~]+++-eT]CC*''
 
 
 	
rv   label2idlabel_columnc                 (   | j         j        vr t          d d| j         j         d          | j        j                 }t          |t                    s~t          |t                    rt          |j        t                    sOt          dt          j	         dt          j	         dt          j	         d| dt          |          j	         d	          t          t                                          d
                     t                                                    }d                                 D             t          |t                    r|j        n|j        j        t          |t                    rfd}nfd}| j        }t          |t                    rt          t#          |          |          n*t          t          t#          |          |                    |<   |                     ||dd          S )a[  Align the dataset's label ID and label name mapping to match an input `label2id` mapping.
        This is useful when you want to ensure that a model's predicted labels are aligned with the dataset.
        The alignment in done using the lowercase label names.

        Args:
            label2id (`dict`):
                The label name to ID mapping to align the dataset with.
            label_column (`str`):
                The column name of labels to align on.

        Example:

        ```python
        >>> # dataset with mapping {'entailment': 0, 'neutral': 1, 'contradiction': 2}
        >>> ds = load_dataset("glue", "mnli", split="train")
        >>> # mapping to align with
        >>> label2id = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2}
        >>> ds_aligned = ds.align_labels_with_mapping(label2id, "label")
        ```

        r|  r}  r~  z5Aligning labels with a mapping is only supported for z column or z column with the inner type z, and column z is of type rg  c                     | d         S )Nr   r   )r1  s    rt   r  z3Dataset.align_labels_with_mapping.<locals>.<lambda>  s
    $q' rv   )r   c                 >    i | ]\  }}|                                 |S r   rc  rh  s      rt   r   z5Dataset.align_labels_with_mapping.<locals>.<dictcomp>  s&    >>>TQAGGIIq>>>rv   c                 R    fd|          D             }fd|D             | <   | S )Nc                 R    g | ]#}| |                                           nd $S rp   r  r   label_idint2str_functions     rt   r   zPDataset.align_labels_with_mapping.<locals>.process_label_ids.<locals>.<listcomp>  sM     $ $ $  ;C:N$$X..44666TX$ $ $rv   c                 (    g | ]}||         nd S rp   r   r   
label_namer  s     rt   r   zPDataset.align_labels_with_mapping.<locals>.process_label_ids.<locals>.<listcomp>  s5     ' ' 'Q[J,BHZ((' ' 'rv   r   r   dset_label_namesr  r  r  s     rt   process_label_idsz<Dataset.align_labels_with_mapping.<locals>.process_label_ids  se    $ $ $ $$),$7$ $ $ ' ' ' '_o' ' 'l# rv   c                 R    fd|          D             }fd|D             | <   | S )Nc                 ,    g | ]}fd |D             S )c                 R    g | ]#}| |                                           nd $S rp   r  r  s     rt   r   z[Dataset.align_labels_with_mapping.<locals>.process_label_ids.<locals>.<listcomp>.<listcomp>  s=    nnn^f8;O%%h//55777UYnnnrv   r   )r   seqr  s     rt   r   zPDataset.align_labels_with_mapping.<locals>.process_label_ids.<locals>.<listcomp>  s?     $ $ $ onnnjmnnn$ $ $rv   c                 ,    g | ]}fd |D             S )c                 (    g | ]}||         nd S rp   r   r  s     rt   r   z[Dataset.align_labels_with_mapping.<locals>.process_label_ids.<locals>.<listcomp>.<listcomp>  s)    dddR\Z-CXj))dddrv   r   )r   r  r  s     rt   r   zPDataset.align_labels_with_mapping.<locals>.process_label_ids.<locals>.<listcomp>  s?     ' ' ' eddd`cddd' ' 'rv   r   r  s     rt   r  z<Dataset.align_labels_with_mapping.<locals>.process_label_ids  se    $ $ $ $$\2$ $ $ ' ' ' '/' ' 'l# rv   )num_classesr  TzAligning the labels)r   r  r  )r  r3  r   rq   r   r   r'   r   r  r   r   r   r8  r   r   r   int2strr   r"  )rs   r  r  label_featurelabel_namesr  r   r  s    ``    @rt   align_labels_with_mappingz!Dataset.align_labels_with_mapping  s   . tz666iiidjNeiiijjj
+L9}j11	=(33	8B=CXZd8e8e	  t
H[  t  thphy  t  t  Xb  Xk  t  t  zG  t  t  UY  Zg  Uh  Uh  Uq  t  t  t  
 x~~//5I5IJJJKK8==??++>>X^^-=-=>>>%/z%J%JmM!!P]PePm 	 mZ00 	       	 	 	 	 	 	 	 = -44WJ3{#3#3;GGGG*[1A1AUUUVV 	
 xx)HdQfxgggrv   )NNNN)NNNFr  )NNNFN)NNFNN)NNNFNN)NNNT)NNF)r  NNNN)r  NN)F)Nr  )r  FNNr  Nrp   )NFr   )NFFNFr  FNFNNr  NFNNr+  NN)NFFNFr  FNFNr  NFNNNr   )NFNFr  FNNr  NNr+  NN)FNr  NFNN)FNr  N)Fr  r  FNNr  N)NNFNNr  N)NNTNNNFNNNr  NN)FFNr  )r  )NN)Nr  ro  )NFNNNNT)NNNNNN)r   r   r   r   rN   r   rD   rH   r   ru   r   r(   r   classmethodr   r  rh  Bufferr  ry  	DataFramer  r   r  r   r  r*  r   rf   r   r  r   r  r  r  r  r  r  r  r  r  rB  r)  r   rX  ro  r#  r$  ru  rw  r3  r   r   r  r  r3   r  r  r+   r  rT  rF  r  r  r   r  r  r  r  rH  r  r   r  contextmanagerr  r:  r  r  r!  r  rV   r  r  r  r   r   r  r  r'  r*  r"  ra  r  r  r  r  r  r  	Sequence_r  r   r  r  r   r  r  r	  rC  r   rI  r   rO  r   rQ  rU  rZ  r^  rb  r  ru  r}  r  r  r   r  r   r  r  r  r  r  __classcell__)r  s   @rt   r   r     s"       --
 '+&*)-%)BT BTBT {#BT 
#	BT
  BT c]BT BT BT BTH (      X  '+&**.%
 %
%
 {#%
 
#	%

 #3-%
 %
 
%
 %
 %
 [%
N  '+&*.2O O	O {#O 
#	O
 !+O 
O O O [O>  (,&*&*)-;2 ;2L;2 8$;2 {#	;2
 
#;2 !;2 
;2 ;2 ;2 [;2z  (,&*&*55 5555 8$55 {#	55
 
#55 
55 55 55 [55n  (,&*&*= =d= 8$= {#	=
 
#= 
= = = [=4  '+'+$"&2 2XtH~562
#2 8$2 	2
 2 3-2 2 2 \2h  (,$%)"&A AA8$A A 	A
 TNA 3-A A A \AF  '+'+$#"&6 6XtH~566
#6 8$6 	6
 6 }6 3-6 6 6 \6p  '+'+$'+"&8 8XtH~568
#8 8$8 	8
 8 $s)$8 3-8 8 8 \8t  '+'+$"&2 2XtH~562
#2 8$2 	2
 2 3-2 2 2 \2h  '+'+%)1 1#1
#1 8$1 	1
 #1 1 1 \1f  (,$7 733347368RThhi7 8$7 	7
 7 7 7 \7r       48$("&*.nM nMnM !sCx1	nM
 SMnM 3-nM "$nM nM nM nM` 6S 6 63 6YabfYg 6 6 6 \64 TC TD T T T \T   )-*.	J JJ !J "$	J
 
J J J \JX e    X& PT$Z P P P XP" &S & & & X& ## # # # X#  'd3i ' ' ' X'  uS#X       X  AS AT A A A A>I I# Id Iy I I I IV 5)))) )x} )i ) ) ) *))\ %)$/3)-+/"&Q QQ SMQ 	Q
 'tnQ "#Q $C=Q 3-Q 
Q Q Q Qf 5)))'' ''# '' ''hWZm ''gp '' '' '' *)''R 5)))3 35d3i+@ 3S[\_S` 3lu 3 3 3 *) _ ^3j 5)))`d? ?$'?:=?PXY\P]?	? ? ? *) ^?B 5)))@ @T#s(^ @hWZm @gp @ @ @ *) ^@D 5))), ,5d3i+@ ,S[\_S` ,lu , , , *) _ ^,\  "  : s T    @t t t 
 
 X
  #"&#(	n nsmn $n !	n n n n> 4((( #"&#(	J
 J
smJ
 $J
 !	J
 J
 J
 )(J
X  @ #'#(	)o )oH%)o $)o !	)o )o )o )oZ #"&#(	1 1sm1 $1 !	1 1 1 1l #'#(	. .H%. $. !	. . . .`0 0U3+<%= 03 0y 0 0 0 0d E#uc/2  tTzAR        $ uS%#%>? D    X s t    X" " "\ \$ \ \ \ \#$S #$ #$ #$ #$J    (,"9=$( %:>$/3)-+/'+!&$("&>)-")f f8$f f 	f
  c49n 56f f SMf f !sDI~!67f f 'tnf "#f $C=f 8$f f  D>!f" 3-#f$ %f& "#'f( sm)f* 
+f f f _ ^fP  (,"-1$( %.2$)-+/'+!&$()-"%s$ s$s$8$s$ s$ 	s$
  S	*s$ s$ SMs$ s$ !c+s$ s$ "#s$ $C=s$ 8$s$ s$ D>s$  "#!s$" sm#s$$ %s$& 
%T5i#889	:'s$ s$ s$ \s$j	 %X%X%Xbi  
 (,9=$($/3)-+/$("&>)-"r r8$r  c49n 56	r
 r SMr r 'tnr "#r $C=r D>r 3-r r "#r smr  
!r r r  _rh 59J8KLLL  %)-+/'+!&"&)-,
 ,
,
 "#,
 $C=	,

 8$,
 ,
 3-,
 "#,
 
,
 ,
 ,
 ML _,
` 26.2%)	
 
!)#
 !+
 c]	

 

 
 
 
> 59R8STTT  %15+/)-U
 U
U
 U
 "*#	U

 $C=U
 "#U
 
U
 U
 U
 UT _U
n 5)))
 *.	4 44 4 "#	4
 
4 4 4 *) _4l 59R8STTT  %15+/)-eu eueu eu "*#	eu
 $C=eu "#eu 
eu eu eu UT _euN 59OQj8klll 16&$/315+/)-M
 M
C3/0M
 tYt_,-M

 M
 M
 'tnM
 "*#M
 $C=M
 "#M
 
M
 M
 M
 ml _M
^ 4@VXq?r  
 #37$/315+/)-~
 ~
sm~
 BI/0~
 	~

 'tn~
 "*#~
 $C=~
 "#~
 
~
 ~
 ~
  _~
@  24JKooo	   .2.2,0"37$/37;6:+//3.2RG RGT)*RG %d*+RG 	RG
 %SMRG smRG BI/0RG RG 'tnRG (0}RG '/smRG $C=RG  (}RG 'smRG 
RG RG RG  _RGp !$15+/J
 J
J
 J
 	J

 J
 "*#J
 $C=J
 
J
 J
 J
 J
^ !W WW W W W Wx %)"&	,v ,v8X-.,v SM,v 3-	,v 
,v ,v ,v ,v\1 1(3- 1QVW[]efj]kWkQl 1 1 1 1f    * %)"&	+x +x8X-.+x SM+x 3-	+x 
+x +x +x +x\ AF% %"3-%9=%	r|Xbl33	4% % % %T %)o o8X-.o SMo
 
o o o oF %)	/e /e/e 368RThhi/e SM	/e 
/e /e /e /eb#    @ 	%tI 	% 	% 	% \	%rV rVhsm rVDU rV rV rV rVn  $"'# $48$(%)GW GWGW }GW $	GW
 }GW GW !sCx1GW SMGW #GW 
sCc!	"GW GW GW GWX  $"'# $48$(%)c
 c
c
 }c
 $	c

 }c
 c
 !sCx1c
 SMc
 #c
 c
 c
 c
J 5)))$ls $lE$.,A $lTW $l $l $l *) _$lR %) $(,%)04$(#jN NN SMN 	N
 !N c]N }-N N SMN N N N N N Nh !%(,%)04$(#j9
 9
9
 9
 	9

 !9
 c]9
 }-9
 9
 SM9
 9
 9
 9
 9
 9
 9
| %)""=A'+*.B BB SMB sm	B
 smB 9:B  }B "$B B B B B BH 5)))2
T 2
C 2
 2
 2
 *) _2
hKh$ Khc Khi Kh Kh Kh Kh Kh Kh Kh Khrv   r   dsetsrm   rn   r  c           	          t          d  D                       rd  D              n d         S |dk    rt          d  D                        nCt           fd D                       st          d          t	          d  D                         d         j        t          fd D                       ri t                              d	           d
 }t          d  D                       r|dk    r9g }t          t                               D ]m} |         j
        > |                             t          t           |                                        |<   |                     |         j
                   nd}t          t                               D ]4} |||         |          ||<   |t           |         j                  z  }5d |D             }|rt          |          }nt          j        g t#          j        dt#          j                    i                    }nbt                     dk    r d         j
        }nAt          t                               D ]} |                                          |<    d}nd}t          d  D             |          }	|dk    rt+          d  D                       }
nd  D             }
t-          |	d |
D                       }	|t/          j        d  D                       }t3          d                    d  D                       t6          ||d          }t9          |	||||          } |j        di  |S )a  
    Converts a list of :class:`Dataset` with the same schema into a single :class:`Dataset`.
    When you concatenate on axis 0, missing data are filled with None values.

    Args:
        dsets (`List[datasets.Dataset]`): List of Datasets to concatenate.
        info (:class:`DatasetInfo`, optional): Dataset information, like description, citation, etc.
        split (:class:`NamedSplit`, optional): Name of the dataset split.
        axis (``{0, 1}``, default ``0``, meaning over rows):
            Axis to concatenate over, where ``0`` means over rows (vertically) and ``1`` means over columns
            (horizontally).

            *New in version 1.6.0*

    Example:

    ```py
    >>> ds3 = _concatenate_map_style_datasets([ds1, ds2])
    ```
    c              3   ,   K   | ]}|j         d k    V  dS r   Nr  r   r  s     rt   rP  z2_concatenate_map_style_datasets.<locals>.<genexpr>  s)      
/
/4=1
/
/
/
/
/
/rv   c                 (    g | ]}|j         d k    |S r  r  r  s     rt   r   z3_concatenate_map_style_datasets.<locals>.<listcomp>  s$    ===$4=1+<+<+<+<+<rv   r   c                     g | ]	}|j         
S r   rZ  r  s     rt   r   z3_concatenate_map_style_datasets.<locals>.<listcomp>
  s    *K*K*KT4=*K*K*Krv   c              3   D   K   | ]}|j         d          j         k    V  dS r  r  )r   r  r  s     rt   rP  z2_concatenate_map_style_datasets.<locals>.<genexpr>  s1      HH$4=E!H$55HHHHHHrv   z*Number of rows must match for all datasetsc                 0    g | ]}|j         j        D ]}|S r   ry  )r   r  rX  s      rt   r   z3_concatenate_map_style_datasets.<locals>.<listcomp>  s*    ```$
H_``HX````rv   c              3   .   K   | ]}|j         k    V  d S rp   )r   )r   r  r   s     rt   rP  z2_concatenate_map_style_datasets.<locals>.<genexpr>  s*      
3
3T4;& 
3
3
3
3
3
3rv   z]Some of the datasets have disparate format. Resetting the format of the concatenated dataset.c                     |dk    r| S | d         }t          j        |t          j        |t          j                                        }t          j        |gdg          S )Nr   r   rx  r  )r  r&  rh  scalarr  rL   r  )rU  rX  r   	new_arrays       rt   apply_offset_to_indices_tablezF_concatenate_map_style_datasets.<locals>.apply_offset_to_indices_table  sZ    Q;;L)$EubiRY[[&I&I&IJJI ,i[LLLLrv   c              3   (   K   | ]}|j         d uV  d S rp   )r  r  s     rt   rP  z2_concatenate_map_style_datasets.<locals>.<genexpr>  s)      
7
74=$
7
7
7
7
7
7rv   Nc                 8    g | ]}t          |          d k    |S r  rV  )r   ts     rt   r   z3_concatenate_map_style_datasets.<locals>.<listcomp>0  s#    FFFA3q66A::a:::rv   r   r  r   c                     g | ]	}|j         
S r   rq  r  s     rt   r   z3_concatenate_map_style_datasets.<locals>.<listcomp>?  s    888$4:888rv   r  c                     g | ]	}|j         
S r   rZ  r  s     rt   r   z3_concatenate_map_style_datasets.<locals>.<listcomp>A  s    (I(I(I4(I(I(Irv   c                     g | ]	}|j         
S r   rZ  r  s     rt   r   z3_concatenate_map_style_datasets.<locals>.<listcomp>C  s    9994999rv   c                 H    i | ]}|                                 D ]\  }}||	 S r   r   )r   r   rO  r  s       rt   r   z3_concatenate_map_style_datasets.<locals>.<dictcomp>D  s;    1r1r1r8aiaoaoaqaq1r1rY]YZ\]!Q1r1r1r1rrv   c                     g | ]	}|j         
S r   ry  r  s     rt   r   z3_concatenate_map_style_datasets.<locals>.<listcomp>H  s    &C&C&CTty&C&C&Crv   rN  c              3   $   K   | ]}|j         V  d S rp   r_  r  s     rt   rP  z2_concatenate_map_style_datasets.<locals>.<genexpr>J  s%      44d!444444rv   r~  r  r   )r  r-   rQ  r   rr  r   r  rm   r   r   r  r  r   r  rP   rL   from_batchesrh  r\  r   r  r,   re  rD   
from_merger;   r   rd  r   r:  )r  rm   rn   r  r!  indices_tablesr   rX  r|  rU  features_listrC  concatenated_datasetr   s   `            @rt   rd  rd    s   6 
/
/
/
/
/// ==%=== Qx qyy)*K*KU*K*K*KLLLLHHHH%HHHHH 	KIJJJ``%```aaa 1X_F

3
3
3
3U
3
3
333 ustttM M M 
7
7
7
7
777 199  N3u::&& 9 98$,$QxDDU3uUVx==EYEYZZE!H%%eAh&78888 F3u::&& . .$A$A.QRBSU[$\$\q!#eAhn--- GFFFFN j -n = = - :2biQZ\^\d\f\fPgFhFh i i i5zzQ %a 1s5zz** : :A$Qx7799E!HH $88%888tDDDEqyy'(I(I5(I(I(IJJ995999)%1r1r}1r1r1rssE |%&C&CU&C&C&CDD$
44e444446U`dotWuWu K
 ##   $#--f---rv   first_exhaustedr@  probabilitiesr  stopping_strategyc                     |dvr't          | dt           d                              t           ||          }d  D             }t          j        dg|dd         z             }	|dk    }
t|
sr|	                    d	d          t          j        t          |                                        dd	          z                                   	                                }n\t          j
        t          j        t          |                                        dd	          t          j        |                              d	d                    }||	z                                   	                                }nt          j        t          |          d
          }|
rt          j        nt          j        } fd}dgt                     z  }g } |            D ]_} ||          r nQ|                    ||         |	|         z              ||xx         d	z  cc<   ||         ||         k    r
d||<   d||<   ` |j        |fi |S )a!  
    Interleave several map-style datasets (sources) into a single map-style dataset.
    The new dataset is constructed by alternating between the sources to get the examples.
    If `probabilities = None` (default) the new dataset is constructed by cycling between each source to get the examples.
    If `probabilities` is not `None, the new dataset is constructed by getting examples from a random source at a time according to the provided probabilities.

    Args:
        datasets (`List[Dataset]`): list of datasets to interleave
        probabilities (`List[float]`, optional, default None): If specified, the new dataset is constructed by sampling
            examples from one source at a time according to these probabilities.
        seed (`int`, optional, default None): The random seed used to choose a source for each example.
        info (:class:`DatasetInfo`, optional): Dataset information, like description, citation, etc.
        split (:class:`NamedSplit`, optional): Name of the dataset split.
        stopping_strategy (Optional `str`, defaults to `first_exhausted`):
            Two strategies are proposed right now.
            By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples.
            If the strategy is `all_exhausted`,  we use an oversampling strategy, i.e the dataset construction is stopped as soon as every samples of every dataset has been added at least once.
            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
            - with no probabilities, the resulting dataset will have max_length_datasets*nb_dataset samples.
            - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.
        **kwargs (additional keyword arguments): Keyword arguments to be passed to :meth:`datasets.Datasets.select` when selecting the indices used to interleave the datasets.

    Output:
        :class:`datasets.Dataset`
    )r0  all_exhaustedzR stopping strategy in `interleave_datasets` is not implemented yet with a list of r   r~  c                 ,    g | ]}t          |          S r   rV  r  s     rt   r   z2_interleave_map_style_datasets.<locals>.<listcomp>  s    ...Ts4yy...rv   Nr  r4  r   Fc               3      K   t           j                                      } 	 d |                     t	                    d          D             E d{V  6)z]Get an infinite iterator that randomly samples the index of the source to pick examples from.Tc              3   4   K   | ]}t          |          V  d S rp   )r   )r   r   s     rt   rP  zN_interleave_map_style_datasets.<locals>.iter_random_indices.<locals>.<genexpr>  s(      bbqCFFbbbbbbrv   r  )rv  pN)r   r  r  choicer   )r  r@  r1  r  s    rt   iter_random_indicesz;_interleave_map_style_datasets.<locals>.iter_random_indices  sg      )''--CcbbCJJs8}}4S`J,a,abbbbbbbbbbcrv   T)r   r   rd  r   cumsumr<  r  r   r  tolistr  r  r   fullr   rQ  r  r   r  )r@  r1  r  rm   rn   r2  r<  concatenated_datasetslengthsoffsetsoversamplingr   is_exhaustedbool_strategy_funcr:  current_index
source_idxs   ```              rt   _interleave_map_style_datasetsrF  Y  s   D  DDD   H  Htx  zB  CD  zE  uF  uF  H  H
 
 	

 <H4W\]]] /.X...Gigcrcl*++G %7L\ ??1b))BIc'll,C,C,K,KBPQ,R,RR[[]]ddff		 &3w<<0088Q??'ARARAZAZ[\^`AaAabb W$--//6688 ws7||U33 (4?RVV	c 	c 	c 	c 	c 	c 	c c(mm+--// 	. 	.J "!,//  NN=4wz7JJKKK*%%%*%%% Z(GJ,???+/Z(,-j)' '::6:::rv   r   r@  
world_sizec                 2    |                      ||d          S )aD  
    Split a dataset for the node at rank `rank` in a pool of nodes of size `world_size`.
    Each node is assigned a chunk of data, e.g. rank 0 is given the first chunk of the dataset.
    To maximize data loading throughput, chunks are made of contiguous data on disk if possible.

    Args:
        dataset ([`Dataset`]):
            The dataset to split by node.
        rank (`int`):
            Rank of the current node.
        world_size (`int`):
            Total number of nodes.

    Returns:
        [`Dataset`]: The dataset to be used on the node at rank `rank`.
    Tr  r  )r   r@  rG  s      rt    _split_by_node_map_style_datasetrI    s    " ==Jdt=LLLrv   r,  r  r-  r/  indices_mappingc           	      b   |r |^ }}|r | g ||R i |}	n | |i |}	n	|^ }}g }	||d         t          t          t                                                                                 }
t	          |
          D ]?fdD             }|	                    |r | ||         fi |n | |fi |           @ng|}t          |d                   }
t	          |
          D ]@fd|D             }|	                    |r | g ||         R i |n | |i |           Ad t          ||	          D             }|ct          j        |t          j	                              }|
                    d                              |          }|                                }d|iS )Nr   c                 .    i | ]}||                  S r   r   )r   r   r   r   s     rt   r   z2get_indices_from_mask_function.<locals>.<dictcomp>  s#    ???#3c
1???rv   c                      g | ]
}|         S r   r   )r   r   r   s     rt   r   z2get_indices_from_mask_function.<locals>.<listcomp>  s    999v999rv   c                     g | ]	\  }}||
S r   r   )r   r   to_keeps      rt   r   z2get_indices_from_mask_function.<locals>.<listcomp>  s!    GGG:1gwGQGGGrv   rx  r   )r   r  rH  r   r   r   re  rh  r   r  r   r  r  )r,  r  r-  r/  rJ  r;  r1  r  r   maskrP  r  r   inputr  r   r   s                  @@rt   r  r    sQ      	28:V:W:::	::DD8V1y11DD    )EuT$uzz||*<*<%=%=>??L<((  ????????BNrHHWgaj>>I>>>T\T\]dTrTrhqTrTr    #)Gwqz??L<((  9999999AMpHH=e=WQZ===9===S[S[]bSpfoSpSp    HGWd););GGGM"RY[[AAA'..q1166}EE%//11}%%rv   )NNr   )NNNNr0  rp   )r   r  r   r  r_  r  r!  r  r  r  r  rG  r  r'  collectionsr   collections.abcr   r   	functoolsr   r   ior   mathr	   r
   pathlibr   r  r   typingr   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r   pandasry  pyarrowrh  pyarrow.computecomputer  huggingface_hubr   r   multiprocessr   requestsr   rN  r    arrow_readerr!   arrow_writerr"   r#   download.download_configr$   #download.streaming_download_managerr%   r   r&   r'   r(   r)   r*   features.featuresr+   r,   r-   r.   r/   r0   filesystemsr1   r2   rC  r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   
formattingr=   r>   r?   r@   rA   formatting.formattingrB   rC   rm   rD   rE   namingrF   searchrG   r  rH   rI   rJ   rK   rU  rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   r  rV   utilsrW   utils.file_utilsrX   rY   rZ   	utils.hubr[   utils.info_utilsr\   utils.metadatar]   utils.py_utilsr^   r_   r`   ra   utils.stratifyrb   utils.tf_utilsrc   rd   re   utils.typingrf   rg   r   typing_extensionssqlite3pyspark
sqlalchemyr  ri   rz  rj   
get_loggerr   r  rl   r   r  r,  rF  rT  re  rj  r   rr  rw  ry  r   r   rd  r  rF  rI  r   r  r   rv   rt   <module>rx     s	    / .           				     				  



           # # # # # #       $ $ $ $ $ $ $ $                                                      ) ( ( ( ( (                    + + + + + + + +                   % % % % % % = = = = = = = = 4 4 4 4 4 4 9 9 9 9 9 9 I I I I I I I I I I I I I I I I                E D D D D D D D                        n m m m m m m m m m m m m m A A A A A A A A / / / / / / / /       " " " " " " ; ; ; ; ; ; ; ; ; ; ; ;                                     H H H H H H H H H H ! ! ! ! ! ! . . . . . . + + + + + + _ _ _ _ _ _ _ _ _ _ _ _ E E E E E E \ \ \ \ \ \ \ \ \ \ " " " " " "* * * *))))))))*  2NNNNNN))))))111111		H	%	%I" I" I" I" I" I" I" I"Xo o o o o o o od		 	 	 	 	9 	 	 	( ( (V  < (     	h5 	h 	h 	h 	hvd3i v v v vT T T
	 	 	 	 	i 	 	 	`Qh `Qh `Qh `Qh `Qh0F `Qh `Qh `QhJc #'"&	o  o =o 
;
o  Jo  	o  o  o  o h ,0"&"&'8g; g;9og;DK(g; 3-g; ;
	g;
 Jg;  }g; g; g; g; g;TMg MS Mc MV] M M M M8 (,,& ,&,&,& ,& E#tCy.12	,&
 e_,& ,& ,& ,& ,& ,&s   0F7 7GG