
    +gd"                       d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddlZddlZddlmZ ddlmZ d	d
l m!Z!m"Z" d	dl#m$Z$ d	dl%m&Z&m'Z'm(Z(m)Z)m*Z* d	dl+m,Z,m-Z-m.Z.m/Z/ d	dl0m1Z1m2Z2 d	dl3m4Z4m5Z5 d	dl6m7Z7 d	dl8m9Z9m:Z: d	dl;m<Z< d	dl=m>Z>m?Z? d	dl@mAZA d	dlBmCZCmDZD d	dlEmFZF d	dlGmHZHmIZImJZJ d	dlKmLZLmMZMmNZN d	dlOmPZP d	dlQmRZRmSZS d	dlTmUZUmVZVmWZWmXZX d	dlYmZZZ d	dl"m[Z[ d	dl\m]Z]m^Z^ d	dl_m`Z` d	d lambZbmcZcmdZdmeZe d	d!lfmgZgmhZhmiZimjZjmkZkmlZlmmZmmnZn d	d"lompZpmqZq  e[jr        es          Zt G d# d$eu          Zv G d% d&ew          Zx G d' d(ex          Zy G d) d*ex          Zze G d+ d,                      Z{ G d- d.          Z| G d/ d0e|          Z} G d1 d2e|          Z~ G d3 d4eu          Z G d5 d6e|          ZdS )7zDatasetBuilder base class.    N)	dataclass)partial)Path)DictIterableMappingOptionalTupleUnion)Pool)
thread_map   )configutils)Dataset)HF_GCP_BASE_URLArrowReaderDatasetNotOnHfGcsErrorMissingFilesOnHfGcsErrorReadInstruction)ArrowWriter
BeamWriterParquetWriterSchemaInferenceError)DataFilesDictsanitize_patterns)DatasetDictIterableDatasetDict)DownloadConfig)DownloadManagerDownloadMode)MockDownloadManager)StreamingDownloadManagerxopen)Features)is_remote_filesystemrename)Hasher)DatasetInfoDatasetInfosDictPostProcessedInfo)ExamplesIterableIterableDataset&_generate_examples_from_tables_wrapper)DuplicatedKeysError)"INVALID_WINDOWS_CHARACTERS_IN_PATHcamelcase_to_snakecase)Split	SplitDictSplitGenerator	SplitInfo)$extend_dataset_builder_for_streaming)logging)cached_pathis_remote_url)FileLock)VerificationModeget_size_checksum_dictverify_checksumsverify_splits)classpropertyconvert_file_size_to_inthas_sufficient_disk_spaceiflatmap_unordered
map_nestedmemoizesize_strtemporary_assignment)_number_of_shards_in_gen_kwargs_split_gen_kwargsc                       e Zd ZdS )InvalidConfigNameN__name__
__module____qualname__     0lib/python3.11/site-packages/datasets/builder.pyrJ   rJ   W           DrP   rJ   c                       e Zd ZdS )DatasetBuildErrorNrK   rO   rP   rQ   rT   rT   [   rR   rP   rT   c                       e Zd ZdS )ManualDownloadErrorNrK   rO   rP   rQ   rV   rV   _   rR   rP   rV   c                       e Zd ZdS )DatasetGenerationErrorNrK   rO   rP   rQ   rX   rX   c   rR   rP   rX   c                       e Zd ZU dZdZeed<    ej        d          Z	e
eej        ef                  ed<   dZe
e         ed<   dZe
e         ed<   dZe
e         ed	<   d
 Zd Z	 ddede
e         defdZdS )BuilderConfiga  Base class for `DatasetBuilder` data configuration.

    `DatasetBuilder` subclasses with data configuration options should subclass
    `BuilderConfig` and add their own properties.

    Attributes:
        name (`str`, defaults to `default`):
        version (`Version` or `str`, *optional*):
        data_dir (`str`, *optional*):
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        description (`str`, *optional*):
    defaultnamez0.0.0versionNdata_dir
data_filesdescriptionc                     t           D ]+}|| j        v r t          dt            d| j         d          ,| j        1t	          | j        t
                    st          d| j                   d S d S )Nz Bad characters from black list 'z' found in 'z\'. They could create issues when creating a directory for this config on Windows filesystem.z/Expected a DataFilesDict in data_files but got )r0   r\   rJ   r_   
isinstancer   
ValueError)selfinvalid_chars     rQ   __post_init__zBuilderConfig.__post_init__}   s    > 	 	Lty(('q7Y q qgkgp q q q   )
 ?&z$/=/Y/Y&`t``aaa '&&&rP   c                     t           j                                                  t          j                                                  k    rdS t           fd j                                        D                       S )NFc              3   d   K   | ]*}|t          |          f|t          |          fk    V  +d S N)getattr).0kord   s     rQ   	<genexpr>z'BuilderConfig.__eq__.<locals>.<genexpr>   sD      ]]1AwtQ''(Q1,>>]]]]]]rP   )set__dict__keysall)rd   rm   s   ``rQ   __eq__zBuilderConfig.__eq__   st     t}!!##$$AJOO,=,=(>(>>>5]]]]]HZHZH\H\]]]]]]rP   config_kwargscustom_featuresreturnc                 ~   d}|                                                     dd                               dd           dv rd                             dd           rfdt                    D             t          d                                 D                       rYd                    d                                 D                       }t          |          d	k    rt          j	                  }nt          j	                  }|Nt                      }|r|
                    |           |
                    |           |                                }|rK| j        d
z   |z   }t          |          t          j        k    r| j        d
z   t          j	        |          z   }|S | j        S )a0  
        The config id is used to build the cache directory.
        By default it is equal to the config name.
        However the name of a config is not sufficient to have a unique identifier for the dataset being generated
        since it doesn't take into account:
        - the config kwargs that can be used to overwrite attributes
        - the custom features used to write the dataset
        - the data_files for json/text/csv/pandas datasets

        Therefore the config id is just the config name with an optional suffix based on these.
        Nr\   r]   r^   c                 "    i | ]}||         S rO   rO   )rk   rl   config_kwargs_to_add_to_suffixs     rQ   
<dictcomp>z2BuilderConfig.create_config_id.<locals>.<dictcomp>   s.     . . .9:1!4. . .rP   c              3   f   K   | ],}t          |t          t          t          t          f          V  -d S ri   )rb   strboolintfloat)rk   vs     rQ   rn   z1BuilderConfig.create_config_id.<locals>.<genexpr>   s5      kka:a#tS%!899kkkkkkrP   ,c              3      K   | ]F\  }}t          |          d z   t          j                            t          |                    z   V  GdS )=N)r|   urllibparse
quote_plus)rk   rl   r   s      rQ   rn   z1BuilderConfig.create_config_id.<locals>.<genexpr>   s\       " "GKq!CFFSL6<#:#:3q66#B#BB" " " " " "rP       -)copypopsortedrr   valuesjoinitemslenr(   hashupdate	hexdigestr\   r   %MAX_DATASET_CONFIG_ID_READABLE_LENGTH)rd   rt   ru   suffixm	config_idry   s         @rQ   create_config_idzBuilderConfig.create_config_id   s   " !%)6););)=)=&&**64888&**9d;;;
 777<Z[e<f<n*..z4@@@) 	E. . . .>DEc>d>d. . .* kkCaChChCjCjkkkkk E " "OmOsOsOuOu" " "   v;;###[)GHHF%CDD&A !   HH_%%%[[]]F 		C&0I9~~ LLL IOfk&.A.AA	9rP   ri   )rL   rM   rN   __doc__r\   r|   __annotations__r   Versionr]   r	   r   r^   r_   r   r`   rf   rs   dictr%   r   rO   rP   rQ   rZ   rZ   g   s           D#3@5=3I3IGXeEM3./0III"Hhsm"""*.J'...!%K#%%%	b 	b 	b^ ^ ^ /37 77 "(+7 
	7 7 7 7 7 7rP   rZ   c                      e Zd ZdZdZeZg ZdZdZ		 	 	 	 	 	 	 	 	 	 	 	 	 dIde
e         de
e         de
e         de
e         de
e         d	e
e         d
e
eeef                  de
e         de
eeeeef                  de
e         de
e         de
e         fdZd Zd Zede
e         fd            Zedefd            ZdefdZ	 dJdeeef         fdZee e             d                                     Z!ed             Z"dKdefdZ#d Z$e%j&        defd            Z'ed             Z(ded efd!Z)	 	 	 	 	 	 	 	 	 	 	 	 	 dLd#e
e         d$e
e*         d%e
ee+ef                  d&e
ee,ef                  d'ed(e
e-         de
e         d)ed*e
eeef                  d+e
e         de
e         fd,Z.d- Z/d$e*fd.Z0d/ Z1d0 Z2defd1Z3d2 Z4d3 Z5d4 Z6	 	 	 	 	 dMd6e
e7         d&e
ee,ef                  dee8e9f         fd7Z:	 dNd6eee;e7f         d8ed&e,d9efd:Z<e7j=        d5fd6ee;e7f         d9ede8fd;Z>d6ee;e7f         defd<Z?	 	 dJd6e
e         de
e         dee@eeAf         eAf         fd=ZBdeAfd>ZCd?e8d@eDeef         de
e8         fdAZEd6ede@eef         fdBZFd6edCed(e-de
e         fdDZGe%j&        d(e-fdE            ZHe%j&        	 	 	 dOdFeId)ed*e
eeef                  d+e
e         fdG            ZJdFeIdeKfdHZLdS )PDatasetBuildera  Abstract base class for all datasets.

    `DatasetBuilder` has 3 key methods:

        - [`DatasetBuilder.info`]: Documents the dataset, including feature
          names, types, shapes, version, splits, citation, etc.
        - [`DatasetBuilder.download_and_prepare`]: Downloads the source data
          and writes it to disk.
        - [`DatasetBuilder.as_dataset`]: Generates a [`Dataset`].

    Some `DatasetBuilder`s expose multiple variants of the
    dataset by defining a [`BuilderConfig`] subclass and accepting a
    config object (or name) on construction. Configurable datasets expose a
    pre-defined set of configurations in [`DatasetBuilder.builder_configs`].

    Args:
        cache_dir (`str`, *optional*):
            Directory to cache data. Defaults to `"~/.cache/huggingface/datasets"`.
        config_name (`str`, *optional*):
            Name of the dataset configuration.
            It affects the data generated on disk. Different configurations will have their own subdirectories and
            versions.
            If not provided, the default configuration is used (if it exists).

            <Added version="2.3.0">

            Parameter `name` was renamed to `config_name`.

            </Added>
        hash (`str`, *optional*):
            Hash specific to the dataset code. Used to update the caching directory when the
            dataset loading script code is updated (to avoid reusing old data).
            The typical caching directory (defined in `self._relative_data_dir`) is `name/version/hash/`.
        base_path (`str`, *optional*):
            Base path for relative paths that are used to download files.
            This can be a remote URL.
        features ([`Features`], *optional*):
            Features types to use with this dataset.
            It can be used to change the [`Features`] types of a dataset, for example.
        use_auth_token (`str` or `bool`, *optional*):
            String or boolean to use as Bearer token for remote files on the
            Datasets Hub. If `True`, will get token from `"~/.huggingface"`.
        repo_id (`str`, *optional*):
            ID of the dataset repository.
            Used to distinguish builders with the same name but not coming from the same namespace, for example "squad"
            and "lhoestq/squad" repo IDs. In the latter, the builder name would be "lhoestq___squad".
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
            For builders like "csv" or "json" that need the user to specify data files. They can be either
            local or remote files. For convenience, you can use a `DataFilesDict`.
        data_dir (`str`, *optional*):
            Path to directory containing source data file(s).
            Use only if `data_files` is not passed, in which case it is equivalent to passing
            `os.path.join(data_dir, "**")` as `data_files`.
            For builders that require manual download, it must be the path to the local directory containing the
            manually downloaded data.
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the dataset file-system backend, if any.
        writer_batch_size (`int`, *optional*):
            Batch size used by the ArrowWriter.
            It defines the number of samples that are kept in memory before writing them
            and also the length of the arrow chunks.
            None means that the ArrowWriter will use its default value.
        name (`str`): Configuration name for the dataset.

            <Deprecated version="2.3.0">

            Use `config_name` instead.

            </Deprecated>

        **config_kwargs (additional keyword arguments): Keyword arguments to be passed to the corresponding builder
            configuration class, set on the class attribute [`DatasetBuilder.BUILDER_CONFIG_CLASS`]. The builder
            configuration class is [`BuilderConfig`] or a subclass of it.
    N
deprecated	cache_dirconfig_namer   	base_pathinfofeaturesuse_auth_tokenrepo_idr_   r^   storage_optionswriter_batch_sizec                 ^	   |dk    rt          j        dt                     |}t          | j                            d          d                   | _        || _        || _        || _	        || _
        || _        |p| j        | _        |	9t          |	t                    s$t          j        t#          |	          ||          }	dt%          j        | j        j                  j        v r|||d<   |	|	|d<   |
|
|d	<    | j        d||d
|\  | _        | _        |i|                                 }|                    |                                            | j        |_        | j        j        |_        | j        j        |_        || _         ||| j         _!        tE          |pt0          j#                  | _$        tK          | j$                  r| j$        n#tL          j'        (                    | j$                  | _$        tK          | j$                  rtR          j*        ntL          j'        j*        }|r || j$        t0          j+                  ntE          t0          j,                  | _-        tK          | j-                  r| j-        n#tL          j'        (                    | j-                  | _-        | .                                | _/        tK          | j$                  stM          j0        | j$        d           tL          j'        *                    | j$        | j/        1                    tL          j2        d          dz             }tg          |          5  tL          j'        4                    | j/                  rtk          tM          j6        | j/                            dk    rqtL          j'        4                     || j/        t0          j7                            r8tp                               d           ts          j:        | j/                  | _         nDtp          ;                    d| j/         d| j         d           tM          j<        | j/                   d d d            n# 1 swxY w Y   | j/        | _=        t}          j?        d          | _@        d | _A        d| _B        t          |            d S )Nr   z\Parameter 'name' was renamed to 'config_name' in version 2.3.0 and will be removed in 3.0.0.)category.)r   r   r   r_   r^   )r   ru   Texist_ok_z.lockr   z<Overwrite dataset info from restored data version if exists.zOld caching folder z for dataset z- exists but no data were found. Removing it. fileFrO   )DwarningswarnFutureWarningr1   rM   splitr\   r   r   r   r   r   DEFAULT_WRITER_BATCH_SIZE_writer_batch_sizerb   r   from_local_or_remoter   inspect	signatureBUILDER_CONFIG_CLASS__init__
parameters_create_builder_configr   r   get_exported_dataset_infor   _infobuilder_namer   r]   r   r   r|   HF_DATASETS_CACHE_cache_dir_rootr9   ospath
expanduser	posixpathr   DOWNLOADED_DATASETS_DIRDOWNLOADED_DATASETS_PATH_cache_downloaded_dir_build_cache_dir
_cache_dirmakedirsreplacesepr:   existsr   listdirDATASET_INFO_FILENAMEloggerr)   from_directorywarningrmdir_output_dirfsspec
filesystem_fs
dl_manager_record_infosr6   )rd   r   r   r   r   r   r   r   r   r_   r^   r   r   r\   rt   	path_join	lock_paths                    rQ   r   zDatasetBuilder.__init__(  s   " <Mn&    K/0E0Ec0J0J20NOO	#'	",."3"Ut7U!*Z*O*O!&;!*--Sa  J
 *4+D+MNNYYY^f^r(0M*%!*4M,'(0M*%&Ad&A '
#$'
 '
 '
 '
#T^ <1133DKK

%%% $	D#{/D;.DL	!)DI  #9#H0HII$1$2F$G$GuD  RWM_M_`d`tMuMu 	 '4D4H&I&I[INNrw|	 6IId*F,JKKKV455 	" T788@D&&##D$>?? 	"
 //11T122 	2K,t<<<<T%94?;R;RSUSY[^;_;_bi;ijjI)$$ 
2 
27>>$/22 	22:do6677!;;7>>))DOVEa*b*bcc T"KK(fggg(3(B4?(S(SDI I$/  I  IPTPY  I  I  I   111
2 
2 
2 
2 
2 
2 
2 
2 
2 
2 
2 
2 
2 
2 
2  ?.4.?.G.G  # 	-T22222s   DQ  Q$'Q$c                     | j         S ri   )rp   rd   s    rQ   __getstate__zDatasetBuilder.__getstate__  s
    }rP   c                 2    || _         t          |            d S ri   )rp   r6   )rd   ds     rQ   __setstate__zDatasetBuilder.__setstate__  s    ,T22222rP   rv   c                     d S ri   rO   r   s    rQ   manual_download_instructionsz+DatasetBuilder.manual_download_instructions      trP   c                 N    t          j        |                                           S )a"  Empty dict if doesn't exist

        Example:

        ```py
        >>> from datasets import load_dataset_builder
        >>> ds_builder = load_dataset_builder('rotten_tomatoes')
        >>> ds_builder.get_all_exported_dataset_infos()
        {'default': DatasetInfo(description="Movie Review Dataset.
This is a dataset of containing 5,331 positive and 5,331 negative processed
sentences from Rotten Tomatoes movie reviews. This data was first used in Bo
Pang and Lillian Lee, ``Seeing stars: Exploiting class relationships for
sentiment categorization with respect to rating scales.'', Proceedings of the
ACL, 2005.
", citation='@InProceedings{Pang+Lee:05a,
  author =       {Bo Pang and Lillian Lee},
  title =        {Seeing stars: Exploiting class relationships for sentiment
                  categorization with respect to rating scales},
  booktitle =    {Proceedings of the ACL},
  year =         2005
}
', homepage='http://www.cs.cornell.edu/people/pabo/movie-review-data/', license='', features={'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}, post_processed=None, supervised_keys=SupervisedKeysData(input='', output=''), task_templates=[TextClassification(task='text-classification', text_column='text', label_column='label')], builder_name='rotten_tomatoes_movie_review', config_name='default', version=1.0.0, splits={'train': SplitInfo(name='train', num_bytes=1074810, num_examples=8530, dataset_name='rotten_tomatoes_movie_review'), 'validation': SplitInfo(name='validation', num_bytes=134679, num_examples=1066, dataset_name='rotten_tomatoes_movie_review'), 'test': SplitInfo(name='test', num_bytes=135972, num_examples=1066, dataset_name='rotten_tomatoes_movie_review')}, download_checksums={'https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz': {'num_bytes': 487770, 'checksum': 'a05befe52aafda71d458d188a1c54506a998b1308613ba76bbda2e5029409ce9'}}, download_size=487770, post_processing_size=None, dataset_size=1345461, size_in_bytes=1833231)}
        ```
        )r*   r   get_imported_module_dirclss    rQ   get_all_exported_dataset_infosz-DatasetBuilder.get_all_exported_dataset_infos  s!      .s/J/J/L/LMMMrP   c                 ~    |                                                      | j        j        t	                                S )a  Empty `DatasetInfo` if doesn't exist

        Example:

        ```py
        >>> from datasets import load_dataset_builder
        >>> ds_builder = load_dataset_builder('rotten_tomatoes')
        >>> ds_builder.get_exported_dataset_info()
        DatasetInfo(description="Movie Review Dataset.
This is a dataset of containing 5,331 positive and 5,331 negative processed
sentences from Rotten Tomatoes movie reviews. This data was first used in Bo
Pang and Lillian Lee, ``Seeing stars: Exploiting class relationships for
sentiment categorization with respect to rating scales.'', Proceedings of the
ACL, 2005.
", citation='@InProceedings{Pang+Lee:05a,
  author =       {Bo Pang and Lillian Lee},
  title =        {Seeing stars: Exploiting class relationships for sentiment
                  categorization with respect to rating scales},
  booktitle =    {Proceedings of the ACL},
  year =         2005
}
', homepage='http://www.cs.cornell.edu/people/pabo/movie-review-data/', license='', features={'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}, post_processed=None, supervised_keys=SupervisedKeysData(input='', output=''), task_templates=[TextClassification(task='text-classification', text_column='text', label_column='label')], builder_name='rotten_tomatoes_movie_review', config_name='default', version=1.0.0, splits={'train': SplitInfo(name='train', num_bytes=1074810, num_examples=8530, dataset_name='rotten_tomatoes_movie_review'), 'validation': SplitInfo(name='validation', num_bytes=134679, num_examples=1066, dataset_name='rotten_tomatoes_movie_review'), 'test': SplitInfo(name='test', num_bytes=135972, num_examples=1066, dataset_name='rotten_tomatoes_movie_review')}, download_checksums={'https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz': {'num_bytes': 487770, 'checksum': 'a05befe52aafda71d458d188a1c54506a998b1308613ba76bbda2e5029409ce9'}}, download_size=487770, post_processing_size=None, dataset_size=1345461, size_in_bytes=1833231)
        ```
        )r   getr   r\   r)   r   s    rQ   r   z(DatasetBuilder.get_exported_dataset_info  s/     2244889I;==YYYrP   c           	      &   d}|| j         r|s| j        J| j                            | j                  }t                              d| j         d|j                    nt          | j                   dk    r[d| j         d| j         d         j         d}t          d	t          | j        
                                           d
| dz             | j         d         }t                              d| j         d|j                    t          |t                    r\| j                            |          }|@| j         r9t          d| dt          | j        
                                                     |sN|||d<   n| j        r|s
| j        |d<   d|vr!t          | d          r| j        r
| j        |d<    | j        di |}ngt#          j        |          }|                                D ]>\  }}|7t          ||          st          d| d| d          t)          |||           ?|j        st          d|j                   |                    ||          }|| j        vo|dk    }	|	rt                              d|            ny|j        | j        v rL|| j        |j                 k    r6t          dt          | j        
                                                     |j        st          d|j         d          ||fS )a  Create and validate BuilderConfig object as well as a unique config id for this config.
        Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None.
        config_kwargs override the defaults kwargs in config
        Nz$No config specified, defaulting to: /r   load_dataset('', 'r   z')zEConfig name is missing.
Please pick one among the available configs: z
Example of usage:
	``z6No config specified, defaulting to the single config: zBuilderConfig z not found. Available: r\   r]   VERSIONz doesn't have a 'z' key.z$BuilderConfig must have a name, got )ru   r[   z Using custom data configuration zvCannot name a custom BuilderConfig the same as an available BuilderConfig. Change the name. Available BuilderConfigs: z must have a versionrO   )BUILDER_CONFIGSDEFAULT_CONFIG_NAMEbuilder_configsr   r   r   r\   r   rc   listrq   r   rb   r|   hasattrr   r   r   deepcopyr   setattrr   r]   )
rd   r   ru   rt   builder_configexample_of_usagekeyvaluer   	is_customs
             rQ   r   z%DatasetBuilder._create_builder_config  s     4#7'3!%!5!9!9$:R!S!SgdiggR`Regghhhht+,,q00'g	'g'gtG[\]G^Gc'g'g'g$$nJNtOcOhOhOjOjJkJkn nH5EHHHI  
 "&!5a!8vUYU^vvaoatvvwww k3'' 	!155kBBN%$*>% l[lldNbNgNgNiNiIjIjll  
  	8&(3f%%) A- A(,(@f%--'$	2J2J-t|-+/<i(6T6GGGGNN "]>::N+1133 8 8
U$">377 h()f.)f)f[^)f)f)fgggNC777" 	[YNDWYYZZZ #33+ 4 
 
	 d&::V	Y@V	 	]KKF9FFGGGG #t';;;"d&:>;N&OOO uQUVZVjVoVoVqVqQrQru u   ") ] ![.2E![![![\\\ y((rP   c                     d | j         D             }t          |          t          | j                   k    r#d | j         D             }t          d|           |S )z:Pre-defined list of configurations for this builder class.c                     i | ]
}|j         |S rO   r\   rk   r   s     rQ   rz   z2DatasetBuilder.builder_configs.<locals>.<dictcomp>  s    III66;IIIrP   c                     g | ]	}|j         
S rO   r  r  s     rQ   
<listcomp>z2DatasetBuilder.builder_configs.<locals>.<listcomp>  s    CCCVV[CCCrP   z5Names in BUILDER_CONFIGS must not be duplicated. Got )r   r   rc   )r   configsnamess      rQ   r   zDatasetBuilder.builder_configs  sj    
 JIS5HIIIw<<3s23333CCs/BCCCE\UZ\\]]]rP   c                     | j         S ri   )r   r   s    rQ   r   zDatasetBuilder.cache_dir  s
    rP   Tc                    | j         r>| j                             d          dk    r | j                             d          d         nd}|| j        n| d| j         }| j        }| j        }|rt          j        j        nt          j        }|r ||| j
                  }|r# ||t          | j        j                            }|r#|r!t          |t                    r |||          }|S )ao  Relative path of this dataset in cache_dir:
        Will be:
            self.name/self.config.version/self.hash/
        or if a repo_id with a namespace has been specified:
            self.namespace___self.name/self.config.version/self.hash/
        If any of these element is missing or if ``with_version=False`` the corresponding subfolders are dropped.
        r   r   N___)r   countr   r\   r   r   r   r   r   r   r   r|   r]   rb   )	rd   with_version	with_hashis_local	namespacebuilder_data_dirr   r   r   s	            rQ   _relative_data_dirz!DatasetBuilder._relative_data_dir#  s    37,h4<CUCUVYCZCZ]^C^C^DL&&s++A..dh	(1(9499)?[?[PTPY?[?[y$,@BGLL).	 	K(y)94>JJ 	U(y)93t{?R;S;STT 	A 	A*T3"7"7 	A(y)94@@rP   c           
      2   t          | j                   }|rt          j        j        nt
          j        } || j        |                     d|                     || j        |                     d|                    }fd}t                    s |            }|ru|d         d         }|| j        j        k    rWdt          |           d| j
         d| j         d	t          | j        j                   d
	}t                              |           |S )z2Return the data directory for the current version.F)r  r  Tc                     t           j                                      sg S g } t          j                  D ]<}	 |                     t          j        |          |f           -# t          $ r Y 9w xY w|                     d           | S )z"Returns previous versions on disk.T)reverse)	r   r   r   r   appendr   r   rc   sort)version_dirnamesdir_namer  s     rQ   _other_versions_on_diskz@DatasetBuilder._build_cache_dir.<locals>._other_versions_on_diskD  s    7>>"233 	!J'788  $++U]8-D-Dh,OPPPP!   D!!$!///##s   )A&&
A32A3r   zFound a different version z of dataset z in cache_dir z". Using currently defined version r   )r9   r   r   r   r   r   r  r   r]   r|   r\   r   r   )	rd   r  r   version_data_dirr  version_dirsother_versionwarn_msgr  s	           @rQ   r   zDatasetBuilder._build_cache_dir9  s_   $T%9:::$,@BGLL).	$9 $"9"9uW_"9"`"`
 
 %9 $"9"9tV^"9"_"_
 
	$ 	$ 	$ 	$ 	$ -.. 
	-2244L - ,Q 2 DK$7777S5G5G 7 7UYU^ 7 7%)%97 7t{2337 7 7 
 NN8,,,rP   c                     t           )a	  Construct the DatasetInfo object. See `DatasetInfo` for details.

        Warning: This function is only called once and the result is cached for all
        following .info() calls.

        Returns:
            info: (DatasetInfo) The dataset information
        NotImplementedErrorr   s    rQ   r   zDatasetBuilder._infoa  s
     "!rP   c                     t           j                            t          j        t          j        |                               S )z8Return the path of the module of this class or subclass.)r   r   dirnamer   getfile	getmoduler   s    rQ   r   z&DatasetBuilder.get_imported_module_dirm  s-     www/@/E/EFFGGGrP   srcdstc                 2    t          | j        ||           d S ri   )r'   r   )rd   r$  r%  s      rQ   _renamezDatasetBuilder._renamer  s    txc"""""rP   arrow
output_dirdownload_configdownload_modeverification_modetry_from_hf_gcsr   file_formatmax_shard_sizenum_procc                 n    |dk    r=|rt           j        nt           j        }t          j        d|j         dt                     |	dk    rt          j        dt                     n j        }	||n j        }t          j
        ||          }|d          _        t           j                   r|d         d         n% j                            |d         d                    _        t          |pt          j                  }t          |pt           j                  }||n j        }|
|
d	vrt'          d
|
 d           j                             j                  dk    r(t+          d j         d j         j        z    d          |v|<t/           j        |t          j        k    |t          j        k    d||	 j                  }t7           j        | j        j        | j        p|t           j        k              }t?          |t@                    s
r|
dk    s|d}| _!        r8tE           j                  j#        $                    dd            j        dz   }rtK          |          ntM          j'                    5  rtP          j)        j*        ntV          j*        } j        ,                     | j        t8          j-                            }|rw|t          j        k    rgt\          /                    d j         d j         d            0                                 _1         2                    |           	 ddd           dS t\          1                    d j         d j         d           rtg           j1        j4        pdtE           j                  j#                  stk          dtm           j1        j4        pd           dtm           j1        j7        pd           dtm           j1        j8        pd           dtm           j1        j9        pd           d	          tL          j:         fd             } j1        j4        rtw          d! j1        j<         d" j1        j=         dtm           j1        j7                   dtm           j1        j8                   dtm           j1        j9                   d#tm           j1        j4                   d$ j         d%           nUr j                             j                  n j        }tw          d! j1        j<         d" j1        j=         d&| d%            >                    |            | j                  5 }t           d'|          5  d}|rt	  @                    |jA                   d}nV# t          t          f$ r t\          1                    d(           Y n)t          $ r t\          /                    d)           Y nw xY w|s$d*|
i}|||d+<   |||d,<     jE        d2||d-|| t          d.  j1        jG        H                                D                        j1        _8        |I                                 j1        _J         j1        j8         j1        j7        z    j1        _4         K                                 ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y    2                    |           tw          d/ j         d0 j         d1           ddd           dS # 1 swxY w Y   dS )3a  Downloads and prepares dataset for reading.

        Args:
            output_dir (`str`, *optional*):
                Output directory for the dataset.
                Default to this builder's `cache_dir`, which is inside `~/.cache/huggingface/datasets` by default.

                <Added version="2.5.0"/>
            download_config (`DownloadConfig`, *optional*):
                Specific download configuration parameters.
            download_mode ([`DownloadMode`] or `str`, *optional*):
                Select the download/generate mode, default to `REUSE_DATASET_IF_EXISTS`.
            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
                Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...).

                <Added version="2.9.1"/>
            ignore_verifications (`bool`, defaults to `False`):
                Ignore the verifications of the downloaded/processed dataset information (checksums/size/splits/...).

                <Deprecated version="2.9.1">

                `ignore_verifications` was deprecated in version 2.9.1 and will be removed in 3.0.0.
                Please use `verification_mode` instead.

                </Deprecated>
            try_from_hf_gcs (`bool`):
                If `True`, it will try to download the already prepared dataset from the HF Google cloud storage.
            dl_manager (`DownloadManager`, *optional*):
                Specific `DownloadManger` to use.
            base_path (`str`, *optional*):
                Base path for relative paths that are used to download files. This can be a remote url.
                If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead.
            use_auth_token (`Union[str, bool]`, *optional*):
                Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
                If True, or not specified, will get token from ~/.huggingface.

                <Deprecated version="2.7.1">

                Pass `use_auth_token` to the initializer/`load_dataset_builder` instead.

                </Deprecated>
            file_format (`str`, *optional*):
                Format of the data files in which the dataset will be written.
                Supported formats: "arrow", "parquet". Default to "arrow" format.
                If the format is "parquet", then image and audio data are embedded into the Parquet files instead of pointing to local files.

                <Added version="2.5.0"/>
            max_shard_size (`Union[str, int]`, *optional*):
                Maximum number of bytes written per shard, default is "500MB".
                The size is based on uncompressed data size, so in practice your shard files may be smaller than
                `max_shard_size` thanks to Parquet compression for example.

                <Added version="2.5.0"/>
            num_proc (`int`, *optional*, defaults to `None`):
                Number of processes when downloading and generating the dataset locally.
                Multiprocessing is disabled by default.

                <Added version="2.7.0"/>
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the caching file-system backend, if any.

                <Added version="2.5.0"/>
            **download_and_prepare_kwargs (additional keyword arguments): Keyword arguments.

        Example:

        Download and prepare the dataset as Arrow files that can be loaded as a Dataset using `builder.as_dataset()`:

        ```py
        >>> from datasets import load_dataset_builder
        >>> builder = load_dataset_builder("rotten_tomatoes")
        >>> ds = builder.download_and_prepare()
        ```

        Download and prepare the dataset as sharded Parquet files locally:

        ```py
        >>> from datasets import load_dataset_builder
        >>> builder = load_dataset_builder("rotten_tomatoes")
        >>> ds = builder.download_and_prepare("./output_dir", file_format="parquet")
        ```

        Download and prepare the dataset as sharded Parquet files in a cloud storage:

        ```py
        >>> from datasets import load_dataset_builder
        >>> storage_options = {"key": aws_access_key_id, "secret": aws_secret_access_key}
        >>> builder = load_dataset_builder("rotten_tomatoes")
        >>> ds = builder.download_and_prepare("s3://my-bucket/my_rotten_tomatoes", storage_options=storage_options, file_format="parquet")
        ```
        r   z'ignore_verifications' was deprecated in favor of 'verification_mode' in version 2.9.1 and will be removed in 3.0.0.
You can remove this warning by passing 'verification_mode=
' instead.z'use_auth_token' was deprecated in version 2.7.1 and will be removed in 3.0.0. Pass `use_auth_token` to the initializer/`load_dataset_builder` instead.Nr   r      )r(  parquetzUnsupported file_format: z. Expected 'arrow' or 'parquet' z7Unable to download and prepare the dataset at the root z'. Please specify a subdirectory, e.g. ''F)r   force_downloadforce_extractuse_etagr0  r   r   )dataset_namer*  r^   r   record_checksumsr(  T)parentsr   z_builder.lockzFound cached dataset z ()zGenerating dataset )	directoryzNot enough disk space. Needed: z (download: z, generated: z, post-processed: c              3      K   s"j                             | d           | V  dS | dz   }t          j        |d           	 |V  t          j                            |           rt          j        |            t          j        ||            t          j                            |          rt          j        |           dS dS # t          j                            |          rt          j        |           w w xY w)z4Create temporary dir for dirname and rename on exit.Tr   z.incompleteN)	r   r   r   r   isdirshutilrmtreemover   )r!  tmp_dirr  rd   s     rQ   incomplete_dirz;DatasetBuilder.download_and_prepare.<locals>.incomplete_dir@  s        3H%%g%===!MMMMM%5GK$77773%7==11 3"M'222GW5557>>'22 3"M'222223 327>>'22 3"M'22223s   AC 6C=z"Downloading and preparing dataset r   z	, total: z) to z... to r   zJDataset not on Hf google storage. Downloading and preparing it from sourcezGHF google storage unreachable. Downloading and preparing it from sourcer.  r/  r0  )r   r,  c              3   $   K   | ]}|j         V  d S ri   )	num_bytes)rk   r   s     rQ   rn   z6DatasetBuilder.download_and_prepare.<locals>.<genexpr>  s$      0h0hU0h0h0h0h0h0hrP   Dataset z downloaded and prepared to z(. Subsequent calls will reuse this data.rO   )Lr;   	NO_CHECKS
ALL_CHECKSr   r   r   r   r   r   r   get_fs_token_pathsr   r&   unstrip_protocolr   r!   REUSE_DATASET_IF_EXISTSBASIC_CHECKSr   rc   _strip_protocolRuntimeErrorr\   r   r   FORCE_REDOWNLOADr   r    r   r^   r   rb   r"   r   r   parentmkdirr:   
contextlibnullcontextr   r   r   r   r   r   r   r   
_load_infor   "download_post_processing_resourcesrA   size_in_bytesOSErrorrE   download_sizedataset_sizepost_processing_sizecontextmanagerprintr   r   _check_manual_downloadrF   _download_prepared_from_hf_gcsr*  r   r   ConnectionError_download_and_preparesumsplitsr   get_recorded_sizes_checksumsdownload_checksums
_save_info)rd   r)  r*  r+  r,  ignore_verificationsr-  r   r   r   r.  r/  r0  r   download_and_prepare_kwargsfs_token_pathsr   r   data_existsrF  _desttmp_output_dirdownloaded_from_gcsprepare_split_kwargsr  s   `                       @rQ   download_and_preparez#DatasetBuilder.download_and_prepareu  s	   X  <//>R s 0 : :XhXsMqM^Mdq q q  
 \))M j   
 "0N#-#9ZZt
2:___.<Q.?+DH5553;p>!,Q//AZAZ[ijk[lmn[oApAp$]%Zl6Z[[,->-_BRB_``!*!6IIDN	"{:N'N'Neeeefff8##D$455;; X$JZ X X8<8H498TX X X  
 &"0"8#0L4Q#Q"/<3P"P"%#1$($8# # # )!Y /-#"&"4"h8IM]Mh8h  J z#677	$	$ g%%)#O$  	;!"")//t/LLL(?:I %-JXi   *2H2J2J b	 b	(0DinI(//))D4DfFb*c*cddK }0TTTWtyWWDDTWWWXXX !OO--	77
CCCb	 b	 b	 b	 b	 b	 b	 b	 KKNdiNN4;KNNNOOO 0I+0qDAQ<R<R<Y    " M(49CZC_^_:`:`  M  Mnvw{  xA  xO  xT  ST  oU  oU  M  M  dl  mq  mv  mC  mH  GH  dI  dI  M  M  ]e  fj  fo  fD  fI  HI  ]J  ]J  M  M  M   &3 3 3 3 3 '&3* y& \9O \ \RVR[Rg \ \"*49+B"C"C\ \RZ[_[d[qRrRr\ \'/	0N'O'O\ \ 'ty'>??\ \ GKFV\ \ \    GOd001ABBBTXTdw9OwwRVR[Rgwwmrwww   ''
333   011 &^ *$~NN & &*/'& vv ??
@Z[[[26// 68PQ v v v"KK(tuuuuu. v v v"NN+tuuuuuv. 0={/K,)5ES01AB#/?G0<22 '1.?  3 :	   .10h0hdiN^NeNeNgNg0h0h0h-h-hDI*3=3Z3Z3\3\DI0.2i.DtyG^.^DI+OO%%%9& & & & & & & & & & & & & & && & & & & & & & & & & & & & &D 33J???:49 : :$BR : : :  b	 b	 b	 b	 b	 b	 b	 b	 b	 b	 b	 b	 b	 b	 b	 b	 b	 b	s   B?\*H\*5[[V*)[*+W=	[#W=	:[<W=	=B=[:[[

[[
[\*[!	!\*$[!	%8\**\.1\.c                     | j         N|j        It          t          j        d| j         d| j        j         d| j          d| j         d	                    d S d S )Nz                     The dataset z with config zp requires manual data.
                    Please follow the manual download instructions:
                     za
                    Manual data can be loaded with:
                     datasets.load_dataset("z$", data_dir="<path/to/manual/data>"))r   
manual_dirrV   textwrapdedentr\   r   rd   r   s     rQ   ra  z%DatasetBuilder._check_manual_download  s    ,8Z=R=Z%_!%_ _9=9I_ _ 7_ _
 .2Y_ _ _ 	 	 	 98=Z=ZrP   c           	      .   |                      dd          }t          | j        | j                  }|                    ||           t          j        | j                  }| j                            |           t          dz   |	                    t          j        d          z   }| j        j        D ]}|                     |                                          D ]}t          j        |v rt          d|           	 t!          |dz   |z             }t#          j        |t          j                            | j        |                     q# t*          $ r! t,                              d| d           Y w xY wt,                              d           d S )	NTF)r  r  r   +Resources shouldn't be in a sub-directory: z Couldn't download resourse file z from Hf google storage.z*Dataset downloaded from Hf google storage.)r  r   r   r   download_from_hf_gcsr)   r   r   r   r   r   r   rf  _post_processing_resourcesr   rc   r8   rB  rD  r   r   rc  r   )	rd   r*  relative_data_dirreaderdownloaded_inforemote_cache_dirr   resource_file_nameresource_paths	            rQ   rb  z-DatasetBuilder._download_prepared_from_hf_gcs  s    33QV3WWT-ty99##O5FGGG%4T5EFF	)))*S03D3L3LRVUX3Y3YYY% 	q 	qE&*&E&Ee&L&L&S&S&U&U q q"6///$%gSe%g%ghhhq$/0@30FI[0[$\$\MKrw||D<LN`/a/abbbb& q q qKK oCU o o opppppqq 	@AAAAAs   >AE(E76E7c           	         t          | j                  }|                     |          } | j        |fi |}|t          j        k    r4|j        r-t          | j        j	        |
                                d           |D ]}t          |j        j                                                  dk    rt          d          t                              d|j        j         d           |                    |j                   	  | j        |fi | nr# t$          $ r2}t%          d| j        pdz   d	z   t          |          z             d
d
}~wt(          $ r+}t)          |j        |j        d| j         d          d
d
}~ww xY w|                                 |t          j        k    s|t          j        k    rt3          | j        j        |           || j        _        |j        | j        _        d
S )a  Downloads and prepares dataset for reading.

        This is the internal implementation to overwrite called when user calls
        `download_and_prepare`. It should download all required data and generate
        the pre-processed datasets files.

        Args:
            dl_manager ([`DownloadManager`]):
                `DownloadManager` used to download and cache data.
            verification_mode ([`VerificationMode`]):
                if `ALL_CHECKS`, perform all the verifications including checksums.
                if `BASIC_CHECKS`, do not perform checksums, only perform split tests.
                if `NO_CHECKS`, do not perform any verification.
            prepare_split_kwargs: Additional options, such as `file_format`, `max_shard_size`
        )r;  zdataset source filesrr   z{`all` is a special split keyword corresponding to the union of all splits, so cannot be used as key in ._split_generator().Generating  splitzCannot find data file. r6  z
Original error:
Nz7To avoid duplicate keys, please fix the dataset script z.py)fix_msg)r3   r\   _make_split_generators_kwargs_split_generatorsr;   rL  r<  r=   r   rh  rg  r|   
split_infolowerrc   r   add_prepare_splitr[  r   r/   r   duplicate_key_indicesmanage_extracted_filesrP  r>   rf  downloaded_sizer\  )	rd   r   r,  rq  
split_dictsplit_generators_kwargssplit_generatorssplit_generatores	            rQ   rd  z$DatasetBuilder._download_and_prepare  sS   " DI666
"&"D"DEY"Z"Z141*XX@WXX  0 ;;;
@[;	,j.U.U.W.WYo  
  0 	0 	0O?-23399;;uDD +   KKMo&@&EMMMNNNNN?5666##OLL7KLLLL   -8>B@+, !ff 
  '   )E+dVZV_ddd   	 --//// 0 ===ARVfVqAqAq$)*J777 &	","<	s$   D
F-E		F&E<<Fc                 @   | j         j        pg D ]}|                     |                                          D ]\  }}t	          | j                  rt          d| j                   t          j        |v rt          d|           t          j
                            | j        |          }t          j
                            |          sN|                     |||          }|r5t                               d| d|            t!          j        ||           d S )Nz/Post processing is not supported on filesystem ry  z$Downloaded post-processing resource z as )r   rf  r{  r   r&   r   r  r   r   rc   r   r   r   r   #_download_post_processing_resourcesr   rB  rD  )rd   r   r   resource_namer  r  downloaded_resource_paths          rQ   rY  z1DatasetBuilder.download_post_processing_resources  sE   Y%+ 	M 	ME595T5TUZ5[5[5a5a5c5c M M11/99 l-.j`d`h.j.jkkk6///$%gSe%g%ghhh "T-=?Q R Rw~~m44 M/3/W/W}j0 0, 0 M$r=$r$r^p$r$rsss$<mLLLM	M 	MrP   c                 L    t          j        | j        | j        j                  S )Nr3  )r)   r   r   r   r   r   s    rQ   rX  zDatasetBuilder._load_info  s     )$*:DHLdeeeerP   c                    t          | j                   }|r
| j        dz   }|rt          |          nt	          j                    5  | j                            | j        | j        j                   d d d            d S # 1 swxY w Y   d S )Nz
_info.lockr3  )	r&   r   r   r:   rV  rW  r   write_to_directoryr   rd   r  r   s      rQ   ri  zDatasetBuilder._save_info  s    +DH555 	8(<7I$,JXi   *2H2J2J 	e 	eI(()948Kc(ddd	e 	e 	e 	e 	e 	e 	e 	e 	e 	e 	e 	e 	e 	e 	e 	e 	e 	es   ,A??BBc                 F   t          | j                   }|r
| j        dz   }|rt          |          nt	          j                    5  t          di | j        j        | j	        i
                    |                                            d d d            d S # 1 swxY w Y   d S )Nz_infos.lockrO   )r&   r   r   r:   rV  rW  r*   r   r\   r   r  r   r  s      rQ   _save_infoszDatasetBuilder._save_infos  s    +DH555 	9(=8I$,JXi   *2H2J2J 	q 	q== 0$)<==PPQUQmQmQoQoppp	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	q 	qs   ABBBc                     ~i S )zFGet kwargs for `self._split_generators()` from `prepare_split_kwargs`.rO   )rd   rq  s     rQ   r  z,DatasetBuilder._make_split_generators_kwargs  s
     	rP   Fr   c                 8   |dk    r8|r|j         nt          j        }t          j        d|j         dt                     t          | j                   }|s*t          dt          | j                  j         d          t          j                            | j                  s t!          d| j         d| j         d          t$                              d	|pd
                    | j        j                   d| j                    |d | j        j        D             }t          |pt          j                  }t1          t3          | j        |||          |dt7          j                               }t;          |t<                    rt?          |          }|S )a&  Return a Dataset for the specified split.

        Args:
            split (`datasets.Split`):
                Which subset of the data to return.
            run_post_process (`bool`, defaults to `True`):
                Whether to run post-processing dataset transforms and/or add
                indexes.
            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
                Verification mode determining the checks to run on the
                downloaded/processed dataset information (checksums/size/splits/...).

                <Added version="2.9.1"/>
            ignore_verifications (`bool`, defaults to `False`):
                Whether to ignore the verifications of the
                downloaded/processed dataset information (checksums/size/splits/...).

                <Deprecated version="2.9.1">

                `ignore_verifications` was deprecated in version 2.9.1 and will be removed in 3.0.0.
                Please use `verification_mode` instead.

                </Deprecated>
            in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.

        Returns:
            datasets.Dataset

        Example:

        ```py
        >>> from datasets import load_dataset_builder
        >>> builder = load_dataset_builder('rotten_tomatoes')
        >>> ds = builder.download_and_prepare()
        >>> ds = builder.as_dataset(split='train')
        >>> ds
        Dataset({
            features: ['text', 'label'],
            num_rows: 8530
        })
        ```
        r   z'ignore_verifications' was deprecated in favor of 'verification' in version 2.9.1 and will be removed in 3.0.0.
You can remove this warning by passing 'verification_mode=r2  zLoading a dataset cached in a z is not supported.rJ  z: could not find data in z. Please make sure to call builder.download_and_prepare(), or use datasets.load_dataset() before trying to access the Dataset object.zConstructing Dataset for split z, z, from Nc                     i | ]}||S rO   rO   )rk   ss     rQ   rz   z-DatasetBuilder.as_dataset.<locals>.<dictcomp>_  s    444aQ444rP   )run_post_processr,  	in_memoryT)	map_tupledisable_tqdm) rK  r;   rL  r   r   r   r   r&   r   r  typerL   r   r   r   r   FileNotFoundErrorr\   r   debugr   r   rf  rP  rC   r   _build_single_datasetr7   is_progress_bar_enabledrb   r   r   )rd   r   r  r,  rj  r  r  datasetss           rQ   
as_datasetzDatasetBuilder.as_dataset  s   f  <//?S t 1 ; ;YiYtMqM^Mdq q q  
 ,DH555 	t%&rtDH~~G^&r&r&rsssw~~d.// 	#V49 V Vt?O V V V   	vu7[		$)JZ@[@[vvdhdtvvwww =4449#3444E,->-_BRB_`` *!1"3#	   $<>>>

 

 

 h%% 	-"8,,HrP   r  r  c                     t          |t                    sUt          |          }|dk    r1d                     j        j                                                  }t          |          }                     ||          }|r 	                    |          
                                D ]"}t          j        |v rt          d|           # fd 	                    |                                          D             }                     ||          }|]|}i }	d}
|                                D ]\  }}t!          |          }||	|<   |t"          j        k    rW|
rU j        j         j        j        j        d}n$ j        j        j                            |          }t-          ||	d            j        j        t/                       j        _         j        j        j        i  j        j        _        |	 j        j        j        t          |          <   t1          d	  j        j        j        
                                D                        j        _         j        j        < j        j        0 j        j         j        j        z    j        j        z    j        _                                           j        j        |j        _         j        j        |j        _         j        j        |j        _         j        j        j        h j        j        j        j         |j        j         k    r)t          d
 j        j        j         d|j                    j        j        j        |j        _        |S )zas_dataset for a single split.rr   +)r   r  ry  c                 b    i | ]+\  }}|t           j                            j        |          ,S rO   )r   r   r   r   )rk   r  r  rd   s      rQ   rz   z8DatasetBuilder._build_single_dataset.<locals>.<dictcomp>  sD       5M#5 rw||D,<>PQQ  rP   NFzpost processing resourcesc              3   T   K   | ]#}|                                 D ]}|d          V  $dS )rI  N)r   )rk   split_checksums_dictschecksums_dicts      rQ   rn   z7DatasetBuilder._build_single_dataset.<locals>.<genexpr>  s^       5 5-*?*F*F*H*H5 5 ' #;/5 5 5 5 5 5 5rP   z:Post-processed features info don't match the dataset:
Got
z
but expected something like
)!rb   r   r|   r   r   rf  rq   r2   _as_datasetr{  r   r   r   rc   r   _post_processr<   r;   rL  post_processedresources_checksumsr   r=   r+   re  r^  r]  r\  rZ  ri  r   r   r  )rd   r   r  r,  r  dsr  resources_pathsr  recorded_checksumsr<  r  r  size_checksumexpected_checksumss   `              rQ   r  z$DatasetBuilder._build_single_datasets  s    %11 	!JJE~~!1!6!6!8!899%LLE   
 
  .	M&*&E&Ee&L&L&S&S&U&U i i"6///$%gSe%g%ghhh 0   9=9X9XY^9_9_9e9e9g9g  O "//ODDN)#%'"#( 4C4I4I4K4K F F0M=$:=$I$IM8E&}55$(8(CCCHXCy/749;S;g;o-1**-1Y-E-Y-]-]^c-d-d*$%79KMhiii9+3/@/B/BDI,9+?GCEDI,@K]	(<SZZH14 5 5151I1]1d1d1f1f5 5 5 2 2	.
 9)5$):Q:]	.1HH49Kii I+ !!!*.)*B'04	0N-)-)@&9+4@y/8=AQQQ( k[_[d[s[|  k  k  ^`  ^i  k  k   ,09+C+L(	rP   c                     | j                             | j                  }t          || j                                      | j        || j        j                                        |          }| 	                    |          }t          dd|i|S )a  Constructs a `Dataset`.

        This is the internal implementation to overwrite called when user calls
        `as_dataset`. It should read the pre-processed datasets files and generate
        the `Dataset` object.

        Args:
            split (`datasets.Split`):
                which subset of the data to read.
            in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.

        Returns:
            `Dataset`
        )r\   instructionssplit_infosr  fingerprintrO   )r   rQ  r   r   r   readr\   rf  r   _get_dataset_fingerprintr   )rd   r   r  r   dataset_kwargsr  s         rQ   r  zDatasetBuilder._as_dataset  s      H,,T-=>>	$Y	::??	(//11	 @ 
 
 33E::AA;A.AAArP   c                    t                      }|                    |                                                     t          j        d                     |                    t          |                     |                                }|S )zThe dataset fingerprint is the hash of the relative directory dataset_name/config_name/version/hash, as well as the split specs.r   )r(   r   r  r   r   r   r|   r   )rd   r   hasherr  s       rQ   r  z'DatasetBuilder._get_dataset_fingerprint  sj    d--//77DDEEEc%jj!!!&&((rP   c                 R   t          | j                   }|s*t          dt          | j                  j         d          t          |p| j        t          | j        | j	                  | j
        | j        j                  }|                     |           d |                     |          D             }||}n/||v r	||         }n"t          d| dt!          |                     t#          | j        |d	          }t'          |t(                    rt+          |          }|S )
Nz(Loading a streaming dataset cached in a z is not supported yet.)r   r   )r   r*  r;  r^   c                     i | ]
}|j         |S rO   r  )rk   sgs     rQ   rz   z7DatasetBuilder.as_streaming_dataset.<locals>.<dictcomp>  s    VVVRRWbVVVrP   Bad split: . Available splits: T)r  )r&   r   r  r  rL   r#   r   r   r   r   r\   r   r^   ra  r  rc   r   rC   _as_streaming_dataset_singlerb   r   r   )rd   r   r   r  r   splits_generatorssplits_generatorr  s           rQ   as_streaming_datasetz#DatasetBuilder.as_streaming_dataset  s]   
 ,DH555 	%j4>>;Rjjj   .14>*$:M_c_sttt[)	
 
 

 	##J///VV43I3I*3U3UVVV=0'''07_5__dK\F]F]__``` -
 
 

 h%% 	5*844HrP   c                     |                      |          }| j        r| j        | j        ini }t          || j        |j        |          S )N)r   r   token_per_repo_id) _get_examples_iterable_for_splitr   r   r-   r   r\   )rd   r  ex_iterabler  s       rQ   r  z+DatasetBuilder._as_streaming_dataset_single  s\     ;;<LMMCG<WT\4+>??UWdi/?/DXi
 
 
 	
rP   datasetr  c                     dS )z%Run dataset transforms or add indexesNrO   )rd   r  r  s      rQ   r  zDatasetBuilder._post_process
  r   rP   c                     i S )z+Mapping resource_name -> resource_file_namerO   rd   r   s     rQ   r{  z)DatasetBuilder._post_processing_resources  s    	rP   r  c                     dS )zPDownload the resource using the download manager and return the downloaded path.NrO   )rd   r   r  r   s       rQ   r  z2DatasetBuilder._download_post_processing_resources  s	     trP   c                     t                      )a  Specify feature dictionary generators and dataset splits.

        This function returns a list of `SplitGenerator`s defining how to generate
        data and what splits to use.

        Example:

            return [
                    datasets.SplitGenerator(
                            name=datasets.Split.TRAIN,
                            gen_kwargs={'file': 'train_data.zip'},
                    ),
                    datasets.SplitGenerator(
                            name=datasets.Split.TEST,
                            gen_kwargs={'file': 'test_data.zip'},
                    ),
            ]

        The above code will first call `_generate_examples(file='train_data.zip')`
        to write the train data, then `_generate_examples(file='test_data.zip')` to
        write the test data.

        Datasets are typically split into different subsets to be used at various
        stages of training and evaluation.

        Note that for datasets without a `VALIDATION` split, you can use a
        fraction of the `TRAIN` data for evaluation as you iterate on your model
        so as not to overfit to the `TEST` data.

        For downloads and extractions, use the given `download_manager`.
        Note that the `DownloadManager` caches downloads, so it is fine to have each
        generator attempt to download the source data.

        A good practice is to download all data in this function, and then
        distribute the relevant parts to each split with the `gen_kwargs` argument

        Args:
            dl_manager (`DownloadManager`):
                Download manager to download the data

        Returns:
            `list<SplitGenerator>`.
        r  rw  s     rQ   r  z DatasetBuilder._split_generators  s    Z "###rP   r  c                     t                      )a!  Generate the examples and record them on disk.

        Args:
            split_generator (`SplitGenerator`):
                Split generator to process
            file_format (`str`, *optional*):
                format of the data files in which the dataset will be written.
                Supported formats: "arrow", "parquet". Default to "arrow" format.
            max_shard_size (`Union[str, int]`, *optional*):
                Maximum number of bytes written per shard, default is "500MB".
                The size is based on uncompressed data size, so in practice your shard files may be smaller than
                `max_shard_size` thanks to Parquet compression for example.
            num_proc (`int`, *optional*, defaults to `None`):
                Number of processes when downloading and generating the dataset locally.
                Multiprocessing is disabled by default.

                <Added version="2.7.0"/>
            **kwargs: Additional kwargs forwarded from _download_and_prepare (ex:
                beam pipeline)
        r  )rd   r  r.  r/  r0  kwargss         rQ   r  zDatasetBuilder._prepare_splitG      : "###rP   c                     t                      )zGenerate the examples on the fly.

        Args:
            split_generator (`SplitGenerator`):
                Split generator to process
        r  rd   r  s     rQ   r  z/DatasetBuilder._get_examples_iterable_for_splitf  s     "###rP   )NNNNNNNNNNNNr   )NN)TTT)NNNNr   TNNr   r(  NNN)NTNr   F)Fr(  NN)MrL   rM   rN   r   r   rZ   r   r   r   r   r	   r|   r)   r%   r   r}   r   r   r   r~   r   r   r   propertyr   classmethodr*   r   r   r
   r   r?   rD   r   r   r  r   abcabstractmethodr   r   r'  r   r!   r;   r    rr  ra  rb  rd  rY  rX  ri  r  r  r2   r   r   r  r   r  TRAINr  r  r   r-   r  r  r   r  r{  r  r  r4   r  r,   r  rO   rP   rQ   r   r      s       J JZ G ) O  !% $(%)"#'&*'+59!%FJ"&*.+/k3 k3C=k3 c]k3 sm	k3
 C=k3 {#k3 8$k3 !tSy!12k3 #k3 U3dM#ABCk3 3-k3 "$k3 $C=k3 k3 k3 k3Z  3 3 3 hsm    X N/? N N N [NZ; Z Z Z Z 15N) N)	}c!	"N) N) N) N)` WYY  Y [ ]   X   VY        ,&  &  & P 		"{ 	" 	" 	" 	" H H [H#3 #S # # # #
 %)48<@DH) $04#'#"48"&*.X XSMX ".1X  lC&7 89	X
 $E*:C*?$@AX X _-X C=X X !sCx1X 3-X "$X X X Xt  Bn B B B B(?= ?= ?=BM M M fK f f f fe e eq q q   "&DH)Z ZZ $E*:C*?$@A	Z 
w#	$Z Z Z ZB  C CS/501C C ,	C
 C C C CJ BG`e B B'=!> BY] Bjq B B B B4eOU4J.K PS      $#'# #}# C=# 
tC()?:	;	# # # #J	
 
	
 	
 	
 	
W wsCx?P U]^eUf     S#X    ),:I	#    	,$O ,$ ,$ ,$ ,$\ 	 #48"&$ $'$ $ !sCx1	$
 3-$ $ $ $<$ $Sc $ $ $ $ $ $rP   r   c                        e Zd ZdZej        d             Z	 	 	 ddedede	e
         de	ee
ef                  fd	Zd
ededede
dedede
deee
eee
ef         f                  fdZ fdZdedefdZ xZS )GeneratorBasedBuilderaw  Base class for datasets with data generation based on dict generators.

    `GeneratorBasedBuilder` is a convenience class that abstracts away much
    of the data writing and reading of `DatasetBuilder`. It expects subclasses to
    implement generators of feature dictionaries across the dataset splits
    (`_split_generators`). See the method docstrings for details.
    c                     t                      )ag  Default function generating examples for each `SplitGenerator`.

        This function preprocess the examples from the raw data to the preprocessed
        dataset files.
        This function is called once for each `SplitGenerator` defined in
        `_split_generators`. The examples yielded here will be written on
        disk.

        Args:
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: `str` or `int`, a unique deterministic example identification key.
                * Unique: An error will be raised if two examples are yield with the
                    same key.
                * Deterministic: When generating the dataset twice, the same example
                    should have the same key.
                Good keys can be the image id, or line number if examples are extracted
                from a text file.
                The key will be hashed and sorted to shuffle examples deterministically,
                such as generating the dataset multiple times keep examples in the
                same order.
            example: `dict<str feature_name, feature_value>`, a feature dictionary
                ready to be encoded and written to disk. The example will be
                encoded with `self.info.features.encode_example({...})`.
        r  rd   r  s     rQ   _generate_examplesz(GeneratorBasedBuilder._generate_examplesy  r  rP   r(  Nr  check_duplicate_keysr0  r/  c                 \    !" t          |pt          j                  }t           j                   }|rt
          j        j        nt          j        } j	        j
         j	        j
        |j                 }n|j        }d}	 j         d|j         |	 d| }
 | j        |
           |r|dk    r{t          |j                  }|dk    r+|)t                               d| d|j         d           d}n6|4||k     r.t           	                    d| d| d	|j         d
| d	           |}t%          j        t%          j                     d|j        dd|j         d          } ||||d||dk    rvd }|j        }d}|5    j        d(||dD ] \  }}}|r|}|                    |           !	 d d d            n# 1 swxY w Y   |
J d            d |D             \  }}}!}nfdt1          t3          |j        |                    D             }t5          |          }d g|z  }d g|z  }d g|z  }d g|z  !d g|z  }t7          |          5 }|5  t9          | j        |          D ]5\  }}}|r|\  ||<   ||<   ||<   !|<   ||<    |                    |           6	 d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   d |vsJ d| d            t;          !          "t;          |          }t;          |          }|d         }||j        _        ||j        _        t                               d" d           "dk    r^dt@          tB                   f  !"fd}d t1          !          D             }tE          ||dd !           d" |D             |j        _#        n[d#\  }} $                     %                    d$|d%          %                    d&|d%           %                    |	d'                      j	        j&        | j	        _&        d S d S ))N-JJJJJ-SSSSS-of-NNNNNr   r   r   Setting num_proc from  back to 1 for the @ split to disable multiprocessing as it only contains one shard.rG  	 for the  split as it only contains  shards.	 examplesFr  r  disableunittotalleavedesc)fpathr.  r/  r  r  r   
gen_kwargsjob_id-Failed to retrieve results from prepare_splitc                     g | ]}|gS rO   rO   rk   items     rQ   r  z8GeneratorBasedBuilder._prepare_split.<locals>.<listcomp>  -     h h hh h hrP   c                 $    g | ]\  }}||d S r  rO   rk   r  r  _prepare_split_argss      rQ   r  z8GeneratorBasedBuilder._prepare_split.<locals>.<listcomp>  ;       &FJ  *VSS?RS  rP   max_num_jobskwargs_iterable;Failed to retrieve results from prepare_split: result list G still contains None - at least one worker failed to return its results	Renaming shard_and_jobc                 "   | \  }}t          d |                   |z   }                                        d|d                              d|d                              d|d                              dd                     d S NSSSSS05dJJJJJzJJJJJ-SSSSSNNNNNre  r'  r   )r  shard_idr  global_shard_idr  rd   shards_per_jobtotal_shardss       rQ   _rename_shardz;GeneratorBasedBuilder._prepare_split.<locals>._rename_shard  s    #0 &"%nWfW&=">">"IMM'h+<+<==EEgRX__MM-O1I1IJJRRSZ_k\q\qrr    rP   c                 @    g | ]\  }}t          |          D ]}||fS rO   rangerk   r  
num_shardsr  s       rQ   r  z8GeneratorBasedBuilder._prepare_split.<locals>.<listcomp>  sP       &FJ %j 1 1   6"   rP   T@   r  max_workersc                     g | ]	}|D ]}|
S rO   rO   rk   shard_lengthsshard_lengths      rQ   r  z8GeneratorBasedBuilder._prepare_split.<locals>.<listcomp>  :     8 8 8!.\i8 8LX8 8 8 8rP   r   r   r  r  r  r6  rO   'r@   r   MAX_SHARD_SIZEr&   r   r   r   r   r   r   rf  r\   r  r   rG   r  r   r   r7   tqdmr  num_examples_prepare_split_singler   	enumeraterH   r   r   rB   re  rI  r  r
   r~   r   r$  r'  r   r   )#rd   r  r  r.  r0  r/  r  r   r  SUFFIXfnamenum_input_shardspbarresultr  r  donecontentexamples_per_jobbytes_per_jobfeatures_per_jobshard_lengths_per_jobkwargs_per_jobnum_jobspooltotal_num_examplestotal_num_bytesr   r  shards_and_jobsr  r  r  r  r  s#   `                              @@@@rQ   r  z$GeneratorBasedBuilder._prepare_split  s    2.2YFDYZZ+DH555$,@BGLL).	9')/*>?JJ(3J(9KK3KVKKkKK	$*E22 	,1>?YZZ1$$)= \X  \  \*/  \  \  \   %*:X*E*E ]X  ]  ];K  ]  ]V`Ve  ]  ]  CS  ]  ]  ]   ,|7999)6z666
 
 
 &,$$8
 
 x1}}F(3JF - --GT-G .)&. .<O. . - -)FD'  -!(G,,,,-- - - - - - - - - - - - - - - %%'V%%%h h#)h h hdm-=~OdOd   *3%o&@xXXX+ +  N >**H $v0!FX-M $v0"Vh.N%)FX$5!h 14 1 11Cd8.2 2 2 1 1-g   1 !( 0 8 -f 5 0 8 .v 6 5f = = !KK000011 1 1 1 1 1 1 1 1 1 1 1 1 1 11 1 1 1 1 1 1 1 1 1 1 1 1 1 1& ,,,, gM]  g  g  g -,, >** !122m,,#A&2D"//>", 	7777888!U3Z          *3N*C*C  O
 }otQSTTTT8 82G8 8 8O&44
  $HfLLg('8'899AA'f??[[fb))  
 9%!)DI &%sI   2GGGKAJ4(K4J8	8K;J8	<KKKr  r  r.  r  r  rv   c           
   #     K    | j         di |}|dk    rt          nt          }	|dk    }
g }d\  }}d}d}	  |	| j        j        |                    d|d                              d|d          | j        |j        || j        j	        |
          }	 t          j
                    }|D ]A\  }}||j        |k    r|                                \  }}|                                 |                    |           ||z  }||z  }|dz  } |	|j        |                    d|d                              d|d          | j        |j        || j        j	        |
          }| j        j        | j        j                            |          n|}|                    ||           |dz  }t          j
                    |t$          j        z   k    rt          j
                    }|d	|fV  d}C	 |d	|fV  |dz   }|                                \  }}|                                 |                    |           ||z  }||z  }n[# |d	|fV  |dz   }|                                \  }}|                                 |                    |           ||z  }||z  }w xY wnE# t(          $ r8}t+          |t,                    r|j        |j        }t1          d
          |d }~ww xY w|d|||j        ||ffV  d S )Nr5  r'  r   r  r  r  )r   r   r   	hash_saltcheck_duplicatesr   embed_local_filesr   F.An error occurred while generating the datasetTrO   )r  r   r   r   r   r   r   r\   r   r   time
_num_bytesfinalizecloser  	_featuresencode_examplewriter   PBAR_REFRESH_TIME_INTERVAL	Exceptionrb   r   __context__rX   )rd   r  r  r.  r/  r  r  r  	generatorwriter_classrB  r$  r<  r=  r  num_examples_progress_updatewriter_timer   recordr+  rI  exampler  r  s                            rQ   r,  z+GeneratorBasedBuilder._prepare_split_single*  s      ,D+99j99	(3y(@(@}}k'94.2+O'($0	b!\+]]7x,=,=>>FFwSYP_P_``"&"9$/!5 $ 8"3  F!-	#, 9 9KC%1f6G.6X6X28//2C2C/i%,,\:::*l:*'94 A!-%+%5!&w88I8I!J!J!R!RSZ_e\k\k!l!l.2.E&0o-A,0H,D.?" " " LP9K]Kidi0??GGGouGLL#...0A50y{{UV-N%NNN $	$e-IIIII784/92 e%AAAAA%\
*0//*;*;'i$$\222"l2"9, e%AAAAA%\
*0//*;*;'i$$\222"l2"9, 	b 	b 	b!122 "q}7PM()YZZ`aa		b d/&BRT^`mnnnnnnns3   AJ EI ,AJ AJJ 
K!)3KK!c                 ~     t                      j        ||fd|t          j        k    p|t          j        k    i| d S )Nr  )superrd  r;   rP  rL  )rd   r   r,  prepare_splits_kwargs	__class__s       rQ   rd  z+GeneratorBasedBuilder._download_and_preparep  sb    %%	
 	
 "36F6S!S "@ $4$??		

 $	
 	
 	
 	
 	
rP   c                 6    t          | j        |j                  S ri   )r,   r  r  r  s     rQ   r  z6GeneratorBasedBuilder._get_examples_iterable_for_splity  s     79STTTrP   r  )rL   rM   rN   r   r  r  r  r4   r}   r	   r~   r   r|   r  r   r5   r   r
   tupler,  rd  r,   r  __classcell__rX  s   @rQ   r  r  p  s         	$ $ $D "&48P* P*'P* #P*
 3-P* !sCx1P* P* P* P*dDoDo Do 	Do
 Do Do #Do Do 
%T5e#445	6Do Do Do DoL
 
 
 
 
U USc U U U U U U U UrP   r  c                       e Zd ZdZej        d             Z	 	 	 ddedede	e
         de	eee
f                  fd	Zd
ededede
de
deee
eee
ef         f                  fdZdedefdZdS )ArrowBasedBuilderzaBase class for datasets with data generation based on Arrow loading functions (CSV/JSON/Parquet).c                     t                      )a  Default function generating examples for each `SplitGenerator`.

        This function preprocess the examples from the raw data to the preprocessed
        dataset files.
        This function is called once for each `SplitGenerator` defined in
        `_split_generators`. The examples yielded here will be written on
        disk.

        Args:
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: `str` or `int`, a unique deterministic example identification key.
                * Unique: An error will be raised if two examples are yield with the
                    same key.
                * Deterministic: When generating the dataset twice, the same example
                    should have the same key.
                Good keys can be the image id, or line number if examples are extracted
                from a text file.
                The key will be hashed and sorted to shuffle examples deterministically,
                such as generating the dataset multiple times keep examples in the
                same order.
            example: `pyarrow.Table`, a feature table
                ready to be encoded and written to disk.
        r  r  s     rQ   _generate_tablesz"ArrowBasedBuilder._generate_tables  s    8 "###rP   r(  Nr  r.  r0  r/  c                 X    ! t          |pt          j                  }t           j                   }|rt
          j        j        nt          j        } j	        j
         j	        j
        |j                 }n|j        }d} j         d|j         | d| }	 | j        |	          |r|dk    r{t          |j                  }
|
dk    r+|)t                               d| d|j         d           d}n6|4|
|k     r.t           	                    d| d|
 d	|j         d
|
 d	           |
}t%          j        t%          j                     d|j        dd|j         d          }||d||dk    rvd }|j        }d}|5    j        d(||dD ] \  }}}|r|}|                    |           !	 d d d            n# 1 swxY w Y   |
J d            d |D             \  }}} }nfdt1          t3          |j        |                    D             }t5          |          }d g|z  }d g|z  }d g|z  }d g|z   d g|z  }t7          |          5 }|5  t9          | j        |          D ]5\  }}}|r|\  ||<   ||<   ||<    |<   ||<    |                    |           6	 d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   d |vsJ d| d            t;                     !t;          |          }t;          |          }|d         }||j        _        ||j        _        t                               d! d           !dk    r^dt@          tB                   f  !fd}d t1                     D             }tE          ||dd !           d" |D             |j        _#        n[d#\  }} $                    %                    d$|d%          %                    d&|d%          %                    |d'                      j	        j&        | j	        _&        d S d S ))Nr  r   r   r   r  r  r  rG  r  r  r  r  Fr  r  r  )r  r.  r/  r   r  r  c                     g | ]}|gS rO   rO   r  s     rQ   r  z4ArrowBasedBuilder._prepare_split.<locals>.<listcomp>  r   rP   c                 $    g | ]\  }}||d S r  rO   r  s      rQ   r  z4ArrowBasedBuilder._prepare_split.<locals>.<listcomp>  r  rP   r  r  r
  r  r  shard_id_and_jobc                 "   | \  }}t          d |                   |z   }                                        d|d                              d|d                              d|d                              dd                     d S r  r  )rd  r  r  r  r  rd   r  r  s       rQ   r  z7ArrowBasedBuilder._prepare_split.<locals>._rename_shard  s    #3 &"%nWfW&=">">"IMM'h+<+<==EEgRX__MM-O1I1IJJRRSZ_k\q\qrr    rP   c                 @    g | ]\  }}t          |          D ]}||fS rO   r  r  s       rQ   r  z4ArrowBasedBuilder._prepare_split.<locals>.<listcomp>  sP     " " "&FJ %j 1 1" "  6"" " " "rP   Tr  r   c                     g | ]	}|D ]}|
S rO   rO   r#  s      rQ   r  z4ArrowBasedBuilder._prepare_split.<locals>.<listcomp>  r&  rP   r'  r  r  r  r6  rO   r(  )"rd   r  r.  r0  r/  r  r   r  r.  r/  r0  r1  r2  r  r  r3  r4  r5  r6  r7  r8  r9  r:  r;  r<  r=  r   r  shard_ids_and_jobsr  r  r  r  r  s"   `                             @@@@rQ   r  z ArrowBasedBuilder._prepare_split  s    2.2YFDYZZ+DH555$,@BGLL).	9')/*>?JJ(3J(9KK3KVKKkKK	$*E22 	,1>?YZZ1$$)= \X  \  \*/  \  \  \   %*:X*E*E ]X  ]  ];K  ]  ]V`Ve  ]  ]  CS  ]  ]  ]   ,|7999)6z666
 
 
 &,
 
 x1}}F(3JF - --GT-G .)&. .<O. . - -)FD'  -!(G,,,,-- - - - - - - - - - - - - - - %%'V%%%h h#)h h hdm-=~OdOd   *3%o&@xXXX+ +  N >**H $v0!FX-M $v0"Vh.N%)FX$5!h 14 1 11Cd8.2 2 2 1 1-g   1 !( 0 8 -f 5 0 8 .v 6 5f = = !KK000011 1 1 1 1 1 1 1 1 1 1 1 1 1 11 1 1 1 1 1 1 1 1 1 1 1 1 1 1& ,,,, gM]  g  g  g -,, >** !122m,,#A&2D"//>", 	7777888!c
         " "*3N*C*C" " "
 }&8$TVWWWW8 82G8 8 8O&44
  $HfLLg('8'899AA'f??[[fb))  
 9%!)DI &%sI    2F??GGK	AJ2&K	2J6	6K	9J6	:K		KKr  r  r  rv   c           	   #     K    | j         di |}|dk    rt          nt          }|dk    }g }	d\  }
}d}d}	  || j        j        |                    d|d                              d|d          | j        | j        j        |          }	 t          j	                    }|D ]\  }}||j
        |k    r|                                \  }}|                                 |	                    |           |
|z  }
||z  }|dz  } ||j        |                    d|d                              d|d          | j        | j        j        |          }|                    |           |t!          |          z  }t          j	                    |t"          j        z   k    rt          j	                    }|d	|fV  d}	 |d	|fV  |dz   }|                                \  }}|                                 |	                    |           |
|z  }
||z  }n[# |d	|fV  |dz   }|                                \  }}|                                 |	                    |           |
|z  }
||z  }w xY wnE# t&          $ r8}t)          |t*                    r|j        |j        }t/          d
          |d }~ww xY w|d|
||j        ||	ffV  d S )Nr5  r'  r   r  r  r  )r   r   r   r   rB  r   FrC  TrO   )r`  r   r   r   r   r   r   r   r   rD  rE  rF  rG  r  rH  write_tabler   r   rK  rL  rb   r   rM  rX   )rd   r  r  r.  r/  r  rN  rO  rB  r$  r<  r=  r  rP  rQ  rR  r   tabler+  rI  r  r  s                         rQ   r,  z'ArrowBasedBuilder._prepare_split_single-  sm      *D)77J77	(3y(@(@}}k'94.2+O'($+	b!\+]]7x,=,=>>FFwSYP_P_``"&"9 $ 8"3  F-	 ) 9 9HAu%1f6G.6X6X28//2C2C/i%,,\:::*l:*'94 A!-%+%5!&w88I8I!J!J!R!RSZ_e\k\k!l!l.2.E,0H,D.?" " " &&u---0CJJ>0y{{UV-N%NNN $	$e-IIIII784)9, e%AAAAA%\
*0//*;*;'i$$\222"l2"9, e%AAAAA%\
*0//*;*;'i$$\222"l2"9, 	b 	b 	b!122 "q}7PM()YZZ`aa		b d/&BRT^`mnnnnnnns3   AI0 D0H =AI0 AI,,I0 0
J2:3J--J2c                 R    t          t          | j                  |j                  S )N)r  )r,   r.   r`  r  r  s     rQ   r  z2ArrowBasedBuilder._get_examples_iterable_for_splitg  s-    243HIIRaRl
 
 
 	
rP   r  )rL   rM   rN   r   r  r  r`  r4   r|   r	   r~   r   r  r   r   r
   r}   rZ  r,  r,   r  rO   rP   rQ   r^  r^  }  s)       kk$ $ $@ #"&48M* M*'M* M* 3-	M*
 !sCx1M* M* M* M*^8o8o'*8o9<8oNQ8o[^8o	%T5e#445	68o 8o 8o 8ot
 
Sc 
 
 
 
 
 
rP   r^  c                       e Zd ZdS )MissingBeamOptionsNrK   rO   rP   rQ   rn  rn  m  rR   rP   rn  c                       e Zd ZdZddd fd
Zd Zej        d             Z fdZ	d Z
	 dd
eeeef                  fdZ	 ddee         deeeef         ef         fdZdedefdZdefdZd Zed             Z xZS )BeamBasedBuilderzBeam-based Builder.N)beam_runnerbeam_optionsc                d    || _         || _        i | _         t                      j        |i | d S ri   )_beam_runner_beam_options_beam_writersrV  r   )rd   rq  rr  argsr  rX  s        rQ   r   zBeamBasedBuilder.__init__t  s=    ')$)&)))))rP   c                     i }t          j        | j                  j                                        }d|v r|d         |d<   |S )Npipeline)r   r   r  r   rq   )rd   rq  r  split_generators_arg_namess       rQ   r  z.BeamBasedBuilder._make_split_generators_kwargsz  sO     #%%,%6t7M%N%N%Y%^%^%`%`"3332Fz2R#J/&&rP   c                     t                      )aB  Build the beam pipeline examples for each `SplitGenerator`.

        This function extracts examples from the raw data with parallel transforms
        in a Beam pipeline. It is called once for each `SplitGenerator` defined in
        `_split_generators`. The examples from the PCollection will be
        encoded and written to disk.

        <Tip warning={true}>
        Warning: When running in a distributed setup, make sure that the data
        which will be read (download_dir, manual_dir,...) and written (cache_dir)
        can be accessed by the workers jobs. The data should be located in a
        shared filesystem, like GCS.
        </Tip>

        Args:
            pipeline ([`utils.beam_utils.BeamPipeline`]):
                Apache Beam pipeline.
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs.

        Returns:
            `beam.PCollection`: Apache Beam PCollection containing the
                example to send to `self.info.features.encode_example(...)`.

        Example:

        ```
        def _build_pcollection(pipeline, extracted_dir=None):
            return (
                    pipeline
                    | beam.Create(gfile.io.listdir(extracted_dir))
                    | beam.Map(_process_file)
            )
        ```
        r  )rd   ry  r  s      rQ   _build_pcollectionz#BeamBasedBuilder._build_pcollection  s    J "###rP   c                 n   dd l }dd lmc m} | j        }| j        }|s-|s+d| j         d| j        j         d}t          d| d          | j	        t                              d           dd	i}	d
|v r3|                    d
          }
|
|	d<   |
|	d<   d|	d<   t          d          |p#|j        j        j                            |	          }|                    ||          } t'                      j        |ft*          j        |d| |                                }|                                 |                                }| j        j        }| j                                        D ]F\  }}|j                                                            |          }|                     |!                    |                    \  }}||         }||_"        ||_#        tI          |d          r%tK          |j&                  dk    r|j&        |_'        |(                    dd          }| j         d| d| }| j         d| d| }tS          | j*                  stV          j,        j-        nt\          j-        } || j/        |          } || j/        |          }| 0                    ||           Hd S )Nr   r   r   z', beam_runner='DirectRunner')a*  Trying to generate a dataset using Apache Beam, yet no Beam Runner or PipelineOptions() has been provided in `load_dataset` or in the builder arguments. For big datasets it has to run on large-scale data processing tools like Dataflow, Spark, etc. More information about Apache Beam runners at https://beam.apache.org/documentation/runners/capability-matrix/
If you really want to run it locally because you feel like the Dataset is small enough, you can use the local beam runner called `DirectRunner` (you may run out of memory). 
Example of usage: 
	`r   zf`writer_batch_size` is not supported for beam pipelines yet. Using the default chunk size for writing.pipeline_type_checkFr0  direct_num_workersnum_workersmulti_processingdirect_running_modezNUsing a DirectRunner with `num_proc` for multiprocessing it not supported yet.)runneroptions)r,  ry  )r  _shard_lengthsr   r.  r(  r   z-00000-of-00001.r   )1apache_beamdatasets.utils.beam_utilsr   
beam_utilsrt  ru  r\   r   rn  r   r   r   r   r  r  pipeline_optionsPipelineOptionsfrom_dictionaryBeamPipelinerV  rd  r;   rK  runwait_until_finishmetricsr   rf  rv  r   MetricsFilterwith_namespacerF  queryr+  rI  r   r   r  r$  r   r&   r   r   r   r   r   r   r'  )rd   r   r,  rW  beamr  rq  rr  usage_exampler  r  ry  pipeline_resultsr  r  
split_namebeam_writerm_filterr+  rI  r  r.  	src_fname	dst_fnamer   	src_fpath	dst_fpathrX  s                              rQ   rd  z&BeamBasedBuilder._download_and_prepare  sC   """"666666666') 	< 	lTYllDK<LlllM$	) &	) 	) 	)   ".NNx   259.../33J??K5@12.9]+6H23%&vwww#vt|'D'T'd'deu'v'v**  + 
 
 	&%	
*:*Dx	
 	
[p	
 	
 	
 $<<>>**,,,"**,,Y%
'+'9'?'?'A'A 	3 	3#J|1133BBZBXXH&1&:&:7==;R;R&S&S#L)#J/J&2J##,J {$455 3#k>X:Y:Y\]:]:]+6+E
(( 477wOO#yTT:TT{TT	#yEE:EEEE	0DTX0N0NbBGLLT]Tb	%Id&6	BB	%Id&6	BB	Y	2222#	3 	3rP   c                 0   dd l }|j        j        j        }t	          | j                  st          j        j        nt          j        }|
                     || j        t          j                            5 }| j                            |           d d d            n# 1 swxY w Y   | j        j        rc|
                     || j        t          j                            5 }| j                            |           d d d            d S # 1 swxY w Y   d S d S )Nr   )r  iofilesystemsFileSystemsr&   r   r   r   r   r   creater   r   r   r   
_dump_infolicenseLICENSE_FILENAME_dump_license)rd   r  fsr   fs        rQ   ri  zBeamBasedBuilder._save_info  sy   """"W ,(<TX(F(FZBGLLIN	YYyy!163OPPQQ 	$UVI  ###	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$9 	+99T%5v7NOOPP +TU	''***+ + + + + + + + + + + + + + + + + +	+ 	+s$   5BB #B !D		DDr(  r/  c                     dd l |t          d          j        j        } j         d| d| }t	           j                  st          j        j        nt          j        } | j
        |          }t           j        j        || j
                   j        |<    j        j        j        j         fd            }	|| |	            z	  z  }
d S )Nr   zmax_shard_size is not supported for Beam datasets.Please set it to None to use the default Apache Beam sharding and get the best performance.r   r   )r   r   r  r   c                      j         | fi j        }|d                    fd          z	  z  }                    |          S )z+PTransformation which build a single split.Encodec                 6    | d          | d                   fS )Nr   r   rO   )key_exrI  s    rQ   <lambda>zMBeamBasedBuilder._prepare_split.<locals>._build_pcollection.<locals>.<lambda>  s!    6!9nn]cde]fNgNgBh rP   )r|  r  Mapwrite_from_pcollection)ry  pcoll_examplesr  r  rI  rd   r  s     rQ   r|  z;BeamBasedBuilder._prepare_split.<locals>._build_pcollection  s[     5T4X\\A[\\Nh$((3h3h3h3h*i*iiiN55nEEErP   )r  r  r  r\   r&   r   r   r   r   r   r   r   r   r   rv  rI  ptransform_fn)rd   r  ry  r.  r/  r  r/  r   r  r|  r   r  r  rI  s   ``         @@@rQ   r  zBeamBasedBuilder._prepare_split  s6    	#"""%%n   %/4
999z99K99(<TX(F(FZBGLLIN		$*E22 Y'ezUYUe
 
 
 *5:&+: 
		F 	F 	F 	F 	F 	F 	F 	F 
		F z%7%7%9%999rP   r   rv   c           	      D                                        fd j        j                                        D             }|r:	 ||         }n0# t          $ r# t          d| dt          |                     w xY wt          |t                    rt          |          }|S )Nc                 z    i | ]7}|j         t                              |          j        |j                    8S ))r   r   )r\   r-   r  r   )rk   r   rd   s     rQ   rz   z9BeamBasedBuilder.as_streaming_dataset.<locals>.<dictcomp>(  sR     
 
 
 J(M(Me(T(T[_[dlqlvwww
 
 
rP   r  r  )
_request_info_from_hf_gcsr   rf  r   KeyErrorrc   r   rb   r   r   )rd   r   r  s   `  rQ   r  z%BeamBasedBuilder.as_streaming_dataset#  s     	&&(((
 
 
 
)0022
 
 
  	\\#E? \ \ \ !Zu!Z!Z$x..!Z!Z[[[\h%% 	5*844Hs   A -A9c                 0    t          | j        d|i          S )Nr   )r,   _generate_examples_from_hf_gcsr  s     rQ   r  z1BeamBasedBuilder._get_examples_iterable_for_split5  s     CguEUVVVrP   c              #      	K   j         r2t          j                   		 fdt          	          D             }n j         d j         dj         dg}d}|D ]}t          |d          5 }t          j                            |          5 }|D ]$}|	                                D ]}||fV  |dz  }%	 d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   d S )Nc                 T    g | ]$}j          d j         dj         d|dddd
%S )r   r   r  z-of-.arrow)_remote_cache_dir_from_hf_gcsr\   )rk   r  r  rd   r   s     rQ   r  zCBeamBasedBuilder._generate_examples_from_hf_gcs.<locals>.<listcomp>;  sa     $ $ $ 5yy	yyEJyyQYyyycmyyyy$ $ $rP   r   r   r  r   rbr   )
r$  r   r  r  r\   r$   paipcopen_stream	to_pylist)
rd   r   remote_prepared_urlsr   remote_prepared_urlr  r}  record_batchrS  r  s
   ``       @rQ   r  z/BeamBasedBuilder._generate_examples_from_hf_gcs8  s      	lU011J$ $ $ $ $ $ %j 1 1$ $ $  
 (,'I$j$jDI$j$jX]Xb$j$j$j#k #7 	% 	%*D11 %QV''** %f(. % %&2&<&<&>&> % %F"%v+---1HCC%%% % % % % % % % % % % % % % %% % % % % % % % % % % % % % %	% 	%s6   0 C(C9CC		CC	CC 	#C 	c                 V   ddl m} | j         dt          j         }	  ||          5 }dd l}|                    |          }d d d            n# 1 swxY w Y   n"# t          $ r}t          |          d d }~ww xY w| j	        
                    t          j        |                     d S )Nr   )r$   r   r   )#download.streaming_download_managerr$   r  r   r   jsonloadr  r   r   r   r)   	from_dict)rd   r$   remote_dataset_infor  r  r   errs          rQ   r  z*BeamBasedBuilder._request_info_from_hf_gcsJ  s   >>>>>>!%!CddfFbdd	8*++ %q		!% % % % % % % % % % % % % % % ! 	8 	8 	8(--47	8	.u5566666s:   A AA AA AA 
A:%A55A:c                     |                      d          }t          dz   |                    t          j        d          z   S )NF)r  r   )r  r   r   r   r   )rd   r|  s     rQ   r  z.BeamBasedBuilder._remote_cache_dir_from_hf_gcsW  s<     33e3DD$'8'@'@'M'MMMrP   )r(  Nri   )rL   rM   rN   r   r   r  r  r  r|  rd  ri  r	   r   r|   r~   r  r   r-   r  r5   r,   r  r  r  r  r  r[  r\  s   @rQ   rp  rp  q  s       *.T * * * * * * *' ' ' 	$$ $$ $$LG3 G3 G3 G3 G3R	+ 	+ 	+ ko": ":NVW\]`be]eWfNg": ": ": ":L  $ } 
tC()?:	;   $Wi WDT W W W W%I % % % %$7 7 7 N N XN N N N NrP   rp  )r   r  rV  r   r   r   r   rB  ru  rD  r   r   dataclassesr   	functoolsr   pathlibr   typingr   r   r   r	   r
   r   r   pyarrowr  multiprocessr   tqdm.contrib.concurrentr   r6  r   r   arrow_datasetr   arrow_readerr   r   r   r   r   arrow_writerr   r   r   r   r_   r   r   dataset_dictr   r   download.download_configr   download.download_managerr    r!   download.mock_download_managerr"   r  r#   r$   r   r%   r  r&   r'   r  r(   r   r)   r*   r+   iterable_datasetr,   r-   r.   keyhashr/   namingr0   r1   rf  r2   r3   r4   r5   	streamingr6   r7   utils.file_utilsr8   r9   utils.filelockr:   utils.info_utilsr;   r<   r=   r>   utils.py_utilsr?   r@   rA   rB   rC   rD   rE   rF   utils.shardingrG   rH   
get_loggerrL   r   rc   rJ   rL  rT   rV   rX   rZ   r   r  r^  rn  rp  rO   rP   rQ   <module>r     s    !   



       				          ! ! ! ! ! !             B B B B B B B B B B B B B B B B            . . . . . .         " " " " " "              W V V V V V V V V V V V 8 8 8 8 8 8 8 8 : : : : : : : : 4 4 4 4 4 4 D D D D D D D D ? ? ? ? ? ? P P P P P P P P                     B B B B B B B B B B g g g g g g g g g g ( ( ( ( ( ( N N N N N N N N ? ? ? ? ? ? ? ? ? ? ? ? ; ; ; ; ; ;       8 8 8 8 8 8 8 8 $ $ $ $ $ $ g g g g g g g g g g g g	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 O N N N N N N N 
	H	%	%	 	 	 	 	
 	 	 		 	 	 	 		 	 	 		 	 	 	 	+ 	 	 		 	 	 	 	. 	 	 	 ^ ^ ^ ^ ^ ^ ^ ^Bd$ d$ d$ d$ d$ d$ d$ d$N%JU JU JU JU JUN JU JU JUZm
 m
 m
 m
 m
 m
 m
 m
`	 	 	 	 	 	 	 	iN iN iN iN iN~ iN iN iN iN iNrP   