
    +gdV                        d Z ddlZddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+  e#e,          Z-g dZ.e/0                    d          de/0                    d          de/0                    d          de/0                    d          de/0                    d          de/0                    d          de/0                    d          de/0                    d          diZ1dd iZ2 e3d!  ee1e2          D                       Z4 G d" d#ej5                  Z6 G d$ d%e          Z7d&e8d'e8fd(Z9d'ee8         fd)Z:d&e8d'ee8         fd*Z; G d+ d,e          Z< G d- d.e<          Z= G d/ d0e<          Z> G d1 d2          Z?dS )3zDownload manager interface.    N)datetime)partial)chain)CallableDict	GeneratorIterableListOptionalTupleUnion   )config)DeprecatedEnum)cached_pathget_from_cachehash_url_to_filenameis_relative_pathurl_or_path_join)get_size_checksum_dict)
get_loggeris_progress_bar_enabledtqdm)NestedDataStructure
map_nestedsize_str   )DownloadConfig)txtcsvjsonjsonltsvconllconlluorigparquetpklpicklerelxml504B0304zip504B0506504B0708425A68bz21F8BgzipFD377A585A00xz04224D18lz428B52FFDzstds   Rar!rarc              #   4   K   | ]}t          |          V  d S N)len).0magic_numbers     Blib/python3.11/site-packages/datasets/download/download_manager.py	<genexpr>rA   F   s>                  c                       e Zd ZdZdZdZdZdS )DownloadModea)  `Enum` for how to treat pre-existing downloads and data.

    The default mode is `REUSE_DATASET_IF_EXISTS`, which will reuse both
    raw downloads and the prepared dataset if they exist.

    The generations modes:

    |                                     | Downloads | Dataset |
    |-------------------------------------|-----------|---------|
    | `REUSE_DATASET_IF_EXISTS` (default) | Reuse     | Reuse   |
    | `REUSE_CACHE_IF_EXISTS`             | Reuse     | Fresh   |
    | `FORCE_REDOWNLOAD`                  | Fresh     | Fresh   |

    reuse_dataset_if_existsreuse_cache_if_existsforce_redownloadN)__name__
__module____qualname____doc__REUSE_DATASET_IF_EXISTSREUSE_CACHE_IF_EXISTSFORCE_REDOWNLOAD rB   r@   rD   rD   L   s.          83)rB   rD   c                   0    e Zd ZdZdZdZed             ZdS )GenerateModerE   rF   rG   c                     dS )NzUse 'DownloadMode' instead.rO   selfs    r@   help_messagezGenerateMode.help_messagef   s    ,,rB   N)rH   rI   rJ   rL   rM   rN   propertyrU   rO   rB   r@   rQ   rQ   a   s?        73)- - X- - -rB   rQ   pathreturnc                 |    |                      d          d         }dD ]}|                     |          d         }|S )N.z?-_r   )split)rW   	extensionsymbs      r@   _get_path_extensionr_   k   sE    

3#I  - -OOD))!,		rB   c                    	 |                      d           n# t          t          j        f$ r Y dS w xY w|                     t
                    }|                      d           t          t
                    D ]q}t                              |dt
          |z
                     }||c S t                              |dt
          |z
                     }|t          d| d          rdS )zQread the magic number from a file-like object and return the compression protocolr   NzCompression protocol 'z' not implemented.)seekAttributeErrorioUnsupportedOperationreadMAGIC_NUMBER_MAX_LENGTHrange$MAGIC_NUMBER_TO_COMPRESSION_PROTOCOLget0MAGIC_NUMBER_TO_UNSUPPORTED_COMPRESSION_PROTOCOLNotImplementedError)fr?   icompressions       r@   *_get_extraction_protocol_with_magic_numberro   u   s
   	q				B34   tt66122LFF1III*++ ` `:>>|LiNehiNiLi?jkk"FJJ<XuZqtuZuXuKvww"%&^{&^&^&^___ #` `s    22c                     t          |           } t          |           }|t          v s|dv s|                     d          rd S t	          | d          5 }t          |          cd d d            S # 1 swxY w Y   d S )N)tgztar)z.tar.gzz.tar.bz2z.tar.xzrb)strr_   BASE_KNOWN_EXTENSIONSendswithopenro   )rW   r]   rl   s      r@   _get_extraction_protocolrx      s    t99D#D))I 	***&&==;<< ' t	dD		 =Q9!<<= = = = = = = = = = = = = = = = = =s   A//A36A3c                   $    e Zd ZdZdefdZd ZdS )_IterableFromGeneratorzkUtility class to create an iterable from a generator function, in order to reset the generator when needed.	generatorc                 0    || _         || _        || _        d S r<   r{   argskwargs)rT   r{   r~   r   s       r@   __init__z_IterableFromGenerator.__init__   s    "	rB   c              #   D   K    | j         | j        i | j        E d {V  d S r<   r}   rS   s    r@   __iter__z_IterableFromGenerator.__iter__   s9      !4>49<<<<<<<<<<<<rB   N)rH   rI   rJ   rK   r   r   r   rO   rB   r@   rz   rz      sD        uu(    
= = = = =rB   rz   c                       e Zd ZdZed             Zed             Zedee	ddf         fd            Z
ededee	ddf         fd            Zedd	            Zedd
            ZdS )ArchiveIterablezIAn iterable of (path, fileobj) from a TAR archive, used by `iter_archive`c              #   *  K   t          j        | d          }|D ]v}|j        }|                                s|!t          j                            |                              d          rT|                    |          }||fV  g |_	        w~d S )Nzr|*)fileobjmoderZ   __)
tarfilerw   nameisregosrW   basename
startswithextractfilemembers)rl   streamtarinfo	file_pathfile_objs        r@   	_iter_tarzArchiveIterable._iter_tar   s      ae444 	  	 GI==??  w	**55kBB ))'22HX%%%%FNNFFrB   c              #   :  K   t          j        |           }|                                D ]o}|j        }|                                r|!t
          j                            |                              d          rT|	                    |          }||fV  pd S )Nr   )
zipfileZipFileinfolistfilenameis_dirr   rW   r   r   rw   )rl   zipfmemberr   r   s        r@   	_iter_zipzArchiveIterable._iter_zip   s      q!!mmoo 
	& 
	&FI}}  w	**55kBB yy((HX%%%%%
	& 
	&rB   rX   Nc              #      K   t          |          }|dk    r|                     |          E d {V  d S |                     |          E d {V  d S )Nr-   )ro   r   r   )clsrl   rn   s      r@   _iter_from_fileobjz"ArchiveIterable._iter_from_fileobj   sr      @CC%}}Q'''''''''''}}Q'''''''''''rB   urlpathc              #      K   t          |          }t          |d          5 }|dk    r|                     |          E d {V  n|                     |          E d {V  d d d            d S # 1 swxY w Y   d S )Nrs   r-   )rx   rw   r   r   )r   r   rn   rl   s       r@   _iter_from_pathzArchiveIterable._iter_from_path   s      .w77'4   	,Ae##==++++++++++==+++++++++		, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	,s   >A--A14A1c                 $     | | j         |          S r<   )r   )r   r   s     r@   from_bufzArchiveIterable.from_buf   s    s3)7333rB   c                 $     | | j         |          S r<   )r   )r   urlpath_or_bufs     r@   	from_pathzArchiveIterable.from_path   s    s3&777rB   )rX   r   )rH   rI   rJ   rK   staticmethodr   r   classmethodr   r   r   rt   r   r   r   rO   rB   r@   r   r      s        SS  \  & & \& (itT0A&B ( ( ( [( ,c ,itT8I.J , , , [, 4 4 4 [4 8 8 8 [8 8 8rB   r   c                   x    e Zd ZdZedeeee         f         deeddf         fd            Z	edd            Z
dS )FilesIterablez8An iterable of paths from a list of directories or filesurlpathsrX   Nc              #   f  K   t          |t                    s|g}|D ]}t          j                            |          r:t          j                            |                              d          r d S |V  \t          j        |          D ]\  }}}t          d |D                       |d d <   t          j                            |                              d          rWt          |          D ]:}|                    d          rt          j        	                    ||          V  ;d S )Nr   c                 <    g | ]}|                     d           |S )r   )r   )r>   dirnames     r@   
<listcomp>z2FilesIterable._iter_from_paths.<locals>.<listcomp>   s-    )q)q)qgQXQcQcdoQpQp)q')q)q)qrB   )

isinstancelistr   rW   isfiler   r   walksortedjoin)r   r   r   dirpathdirnames	filenamesr   s          r@   _iter_from_pathszFilesIterable._iter_from_paths   sN     (D)) 	" zH 	> 	>Gw~~g&& >7##G,,77DD FF46GG4D4D > >0GXy #))q)q)q)q)q"r"rHQQQKw''00;;KHH ! $*9$5$5 > >#..{;; %$ gll7H======	>>	> 	>rB   c                 $     | | j         |          S r<   )r   )r   r   s     r@   
from_pathszFilesIterable.from_paths   s    s3'222rB   )rX   r   )rH   rI   rJ   rK   r   r   rt   r
   r   r   r   rO   rB   r@   r   r      s        BB>c49n(= >)CQUW[OB\ > > > [>. 3 3 3 [3 3 3rB   r   c            
       F   e Zd ZdZ	 	 	 	 	 ddee         dee         dee         dee         fdZed	             Z	ed
             Z
ed             ZdedefdZd Zd ZdededefdZdeeej        f         fdZdeeee         f         fdZddZd Zd Zd Zd ZdS ) DownloadManagerFNTdataset_namedata_dirdownload_config	base_pathc                     || _         || _        |pt          j                            d          | _        i | _        || _        |pt                      | _	        i | _
        i | _        dS )a4  Download manager constructor.

        Args:
            data_dir:
                can be used to specify a manual directory to get the files from.
            dataset_name (`str`):
                name of dataset this instance will be used for. If
                provided, downloads will contain which datasets they were used for.
            download_config (`DownloadConfig`):
                to specify the cache directory and other
                download options
            base_path (`str`):
                base path that is used when relative paths are used to
                download files. This can be a remote url.
            record_checksums (`bool`, defaults to `True`):
                Whether to record the checksums of the downloaded files. If None, the value is inferred from the builder.
        rZ   N)_dataset_name	_data_dirr   rW   abspath
_base_path_recorded_sizes_checksumsrecord_checksumsr   r   downloaded_pathsextracted_paths)rT   r   r   r   r   r   s         r@   r   zDownloadManager.__init__  sf    2 *!#;rws';';Z\& 0.B.2B2B "!rB   c                     | j         S r<   )r   rS   s    r@   
manual_dirzDownloadManager.manual_dir$  s
    ~rB   c                 b    t          d | j                                        D                       S )z+Returns the total size of downloaded files.c              3   &   K   | ]}|d          V  dS )	num_bytesNrO   )r>   checksums_dicts     r@   rA   z2DownloadManager.downloaded_size.<locals>.<genexpr>+  s'      mm>>+.mmmmmmrB   )sumr   valuesrS   s    r@   downloaded_sizezDownloadManager.downloaded_size(  s0     mmTEcEjEjElElmmmmmmrB   c                     ddl m |j                                                            d          t          d          fdt          fd| t                                 }|S )	a  Ship the files using Beam FileSystems to the pipeline temp dir.

        Args:
            downloaded_path_or_paths (`str` or `list[str]` or `dict[str, str]`):
                Nested structure containing the
                downloaded path(s).
            pipeline ([`utils.beam_utils.BeamPipeline`]):
                Apache Beam Pipeline.

        Returns:
            `str` or `list[str]` or `dict[str, str]`
        r   )upload_local_to_remotetemp_locationNzFYou need to specify 'temp_location' in PipelineOptions to upload filesc           
      6   t          j        t          j        t          j                            |                     }t                              d|  dt          t          j        
                    |                      d| d            | |           |S )Nz
Uploading z (z) to rZ   )	posixpathr   r   DOWNLOADED_DATASETS_DIRr   rW   r   loggerinfor   getsize)local_file_pathremote_file_path
remote_dirr   s     r@   uploadz8DownloadManager.ship_files_with_pipeline.<locals>.uploadA  s    (~F:BG<L<L_<]<]    KKt_tt9Y9Y0Z0Zttaqttt   #"?4DEEE##rB   c                      |           S r<   rO   )r   r   s    r@   <lambda>z:DownloadManager.ship_files_with_pipeline.<locals>.<lambda>L  s    FF?$;$; rB   disable_tqdm)utils.beam_utilsr   _optionsget_all_optionsri   
ValueErrorr   r   )downloaded_path_or_pathspipelineuploaded_path_or_pathsr   r   r   s      @@@r@   ship_files_with_pipelinez(DownloadManager.ship_files_with_pipeline-  s     	>=====&6688<<_MM
efff	$ 	$ 	$ 	$ 	$ 	$ ",;;;;$4666"
 "
 "

 &%rB   url_or_urlsr   c           	      *   d}t          t          t          |                                |                                                    |dt	                                 D ]0\  }}t          || j                  | j        t          |          <   1dS )z)Record size/checksum of downloaded files.   zComputing checksums)delaydescdisable)record_checksumN)	r   r   r-   flattenr   r   r   r   rt   )rT   r   r   r   urlrW   s         r@   _record_sizes_checksumsz'DownloadManager._record_sizes_checksumsR  s    [((**,D,L,L,N,NOOPP&/111	
 
 
 		 		IC 8Nd&;8 8 8D*3s8844		 		rB   c                 .  	 | j         j        pt          j        	| j         j        }	fd}t          ||t                                 }t          |          }t          |          }t          |	                                |	                                          D ]]\  }}	 t          |	dd|           d}n# t          $ r d}Y nw xY w|r| j         j        r  |||           t          |	dd|           ^|                     ||           |j        S )a  
        Download given urls(s) by calling `custom_download`.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download and extract. Each URL is a `str`.
            custom_download (`Callable[src_url, dst_path]`):
                The source URL and destination path. For example
                `tf.io.gfile.copy`, that lets you download from  Google storage.

        Returns:
            downloaded_path(s): `str`, The downloaded paths matching the given input
                `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download_custom('s3://my-bucket/data.zip', custom_download_for_my_private_bucket)
        ```
        c                 ^    t           j                            t          |                     S r<   )r   rW   r   r   )r   	cache_dirs    r@   url_to_downloaded_pathz?DownloadManager.download_custom.<locals>.url_to_downloaded_pathx  s"    7<<	+?+D+DEEErB   r   TF)r   local_files_onlyuse_etagmax_retries)r   r   r   DOWNLOADED_DATASETS_PATHr  r   r   r   r-   r   r   FileNotFoundErrorforce_downloadr   data)
rT   r   custom_downloadr  r   r   r   rW   cachedr   s
            @r@   download_customzDownloadManager.download_custom`  s   * (2Uf6U	*6	F 	F 	F 	F 	F $."KBYB[B[>[$
 $
 $
  *+66#67O#P#P [00224L4T4T4V4VWW 	 	IC9teal    $    T1@ T***9teal    	$$[2JKKK',,s   "B99CCc           	      p   | j                                         }d|_        |j        d|_        t	          | j        |          }t          j                    }t          ||d|j	        t                       d          }t          j                    |z
  }t                              d|                                d	z   d
           t          |          }t          |          }| j                            t#          t%          |                                |                                                               t          j                    }|                     ||           t          j                    |z
  }t                              d|                                d	z   d
           |j        S )ay  Download given URL(s).

        By default, only one process is used for download. Pass customized `download_config.num_proc` to change this behavior.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download. Each URL is a `str`.

        Returns:
            `str` or `list` or `dict`:
                The downloaded paths matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        FNDownloading datar   TzDownloading data files)	map_tuplenum_procr   r   zDownloading took <   z minzChecksum Computation took )r   copyextract_compressed_filedownload_descr   	_downloadr   nowr   r  r   r   r   total_secondsr   r   updatedictr-   r   r   r  )rT   r   r   download_func
start_timer   durations          r@   downloadzDownloadManager.download  s   & .335527/(0,>O)PPP\^^
#-$-4666)$
 $
 $
  <>>J.L(>(>(@(@B(FLLLMMM)+66#67O#P#P $$T#k.A.A.C.CE]EeEeEgEg*h*h%i%ijjj\^^
$$[2JKKK<>>J.U1G1G1I1IR1OUUUVVV',,rB   url_or_filenamerX   c                     t          |          }t          |          rt          | j        |          }t	          ||          S )Nr  )rt   r   r   r   r   )rT   r  r   s      r@   r  zDownloadManager._download  sD    o..O,, 	Q.tPPO?OLLLLrB   path_or_bufc                     t          |d          rt                              |          S t                              |          S )aK  Iterate over files within an archive.

        Args:
            path_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        re   )hasattrr   r   r   )rT   r  s     r@   iter_archivezDownloadManager.iter_archive  s>    ( ;'' 	:"++K888",,[999rB   pathsc                 6    t                               |          S )a  Iterate over file paths.

        Args:
            paths (`str` or `list` of `str`):
                Root paths.

        Yields:
            `str`: File path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        )r   r   )rT   r!  s     r@   
iter_fileszDownloadManager.iter_files  s    " ''...rB   
deprecatedc           	         |dk    rt          j        dt                     | j                                        }d|_        |j        d|_        t          t          t          |          ||j
        t                       d          }t          |          }t          |          }| j                            t          t!          |                                |                                                               |j        S )	ak  Extract given path(s).

        Args:
            path_or_paths (path or `list` or `dict`):
                Path of file to extract. Each path is a `str`.
            num_proc (`int`):
                Use multi-processing if `num_proc` > 1 and the length of
                `path_or_paths` is larger than `num_proc`.

                <Deprecated version="2.6.2">

                Pass `DownloadConfig(num_proc=<num_proc>)` to the initializer instead.

                </Deprecated>

        Returns:
            extracted_path(s): `str`, The extracted paths matching the given input
            path_or_paths.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        r$  z'num_proc' was deprecated in version 2.6.2 and will be removed in 3.0.0. Pass `DownloadConfig(num_proc=<num_proc>)` to the initializer instead.TNr
  r  zExtracting data files)r  r   r   )warningswarnFutureWarningr   r  r  r  r   r   r   r  r   r   r   r  r  r-   r   r  )rT   path_or_pathsr  r   r   s        r@   extractzDownloadManager.extract  s    6 |##M b   .335526/(0,>O)$KAAA$-4666(
 
 
 ,M::-o>>##D]-B-B-D-DoF]F]F_F_)`)`$a$abbb##rB   c                 R    |                      |                     |                    S )a  Download and extract given `url_or_urls`.

        Is roughly equivalent to:

        ```
        extracted_paths = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download and extract. Each URL is a `str`.

        Returns:
            extracted_path(s): `str`, extracted paths of given URL(s).
        )r*  r  )rT   r   s     r@   download_and_extractz$DownloadManager.download_and_extract$  s"      ||DMM+66777rB   c                 4    | j                                         S r<   )r   r  rS   s    r@   get_recorded_sizes_checksumsz,DownloadManager.get_recorded_sizes_checksums6  s    -22444rB   c                 v   t          | j                                                  t          | j                                                  z
  }t	          | j                                                  D ]D\  }}||v r;t          j                            |          rt          j	        |           | j        |= Ed S r<   )
setr   r   r   r   itemsr   rW   r   remove)rT   paths_to_deletekeyrW   s       r@   delete_extracted_filesz&DownloadManager.delete_extracted_files9  s    d299;;<<s4CXC_C_CaCa?b?bbd288::;; 	. 	.IC&&27>>$+?+?&	$(-	. 	.rB   c                 J    | j         j        r|                                  d S d S r<   )r   delete_extractedr5  rS   s    r@   manage_extracted_filesz&DownloadManager.manage_extracted_files@  s2    0 	*'')))))	* 	*rB   )NNNNT)r$  )rH   rI   rJ   is_streamingr   rt   r   r   rV   r   r   r   r   r   r   r  r  r  r   rc   BufferedReaderr   r
   r#  r*  r,  r.  r5  r8  rO   rB   r@   r   r      s       L '+"&48#'!" !"sm!" 3-!" ".1	!"
 C=!" !" !" !"F   X n n Xn "& "& \"&H3F bu    .- .- .-`.- .- .-`M M~ MRU M M M M:c23D.D(E : : : :2/c49n 5 / / / /&/$ /$ /$ /$b8 8 8$5 5 5. . .* * * * *rB   r   )@rK   enumrc   r   r   r   r&  r   r   	functoolsr   	itertoolsr   typingr   r   r   r	   r
   r   r   r    r   utils.deprecation_utilsr   utils.file_utilsr   r   r   r   r   utils.info_utilsr   utils.loggingr   r   r   utils.py_utilsr   r   r   r   r   rH   r   ru   bytesfromhexrh   rj   maxrf   EnumrD   rQ   rt   r_   ro   rx   rz   r   r   r   rO   rB   r@   <module>rI     s    " !  				 				                          T T T T T T T T T T T T T T T T T T T T       4 4 4 4 4 4 t t t t t t t t t t t t t t 5 5 5 5 5 5 E E E E E E E E E E F F F F F F F F F F + + + + + + 
H		     
MM*u	MM*u	MM*u	MM(U	MM&6	MM.!!4	MM*u	MM*v	( $ U4 0 #  BDtuu     * * * * *49 * * **- - - - -> - - -c c    `Xc] ` ` ` `$=3 =8C= = = = =	= 	= 	= 	= 	=X 	= 	= 	=:8 :8 :8 :8 :8, :8 :8 :8z3 3 3 3 3* 3 3 3@D* D* D* D* D* D* D* D* D* D*rB   