
    +gd[                        U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZ d dlZd dlZd dlZddlmZ ddlmZ ddlmZmZm Z m!Z! dd	l"m#Z# dd
l$m%Z% ddl&m'Z'm(Z( erddl)m*Z*  e%e+          Z,da-da.ed         e/d<   da0eej1                 e/d<    G d d          Z2d Z3d Z4d Z5d Z6 e#d          de7fd            Z8de7fdZ9de:fdZ;d Z< G d d          Z= e<ej!        e!ee e          d             Z> e<e          d              Z?i a@ee:e7f         e/d!<   de:fd"ZAd5de:fd$ZBd% ZCd5d&e:fd'ZDd6d(ed)ee:         de:fd*ZE	 	 	 d7d(ed,ed-ee:ef         d.eee:                  d/eee:                  d0e7dee:ef         fd1ZF	 	 	 	 	 d8d2e7d.eee:                  d/eee:                  d3eee:                  d0e7d)ee:         fd4ZGdS )9    N)wraps)Path)TYPE_CHECKINGAnyCallableDictListOptionalTupleUnion   )DatasetInfo)"INVALID_WINDOWS_CHARACTERS_IN_PATH)ConcatenationTableInMemoryTableMemoryMappedTableTable)
deprecated)
get_logger)asdictdumps)DatasetT_TempDirWithCustomCleanup_TEMP_DIR_FOR_TEMP_CACHE_FILES _DATASETS_WITH_TABLE_IN_TEMP_DIRc                   &    e Zd ZdZddZd Zd ZdS )r   z
    A temporary directory with a custom cleanup function.
    We need a custom temporary directory cleanup in order to delete the dataset objects that have
    cache files in the temporary directory before deleting the dorectory itself.
    Nc                     t          j                    | _        t          j        | | j                  | _        || _        || _        || _	        d S N)
tempfilemkdtempnameweakreffinalize_cleanup
_finalizer_cleanup_func_cleanup_func_args_cleanup_func_kwargs)selfcleanup_funccleanup_func_argscleanup_func_kwargss       4lib/python3.11/site-packages/datasets/fingerprint.py__init__z"_TempDirWithCustomCleanup.__init__8   sF    $&&	!*4??)"3$7!!!    c                      | j         | j        i | j         t          j                            | j                  rt          j        | j                   d S d S r   )	r&   r'   r(   ospathexistsr!   shutilrmtreer)   s    r-   r$   z"_TempDirWithCustomCleanup._cleanup?   sZ    D3Qt7PQQQ7>>$)$$ 	%M$)$$$$$	% 	%r/   c                 d    | j                                         r|                                  d S d S r   )r%   detachr$   r6   s    r-   cleanupz!_TempDirWithCustomCleanup.cleanupD   s3    ?!!## 	MMOOOOO	 	r/   r   )__name__
__module____qualname____doc__r.   r$   r9    r/   r-   r   r   1   sP         8 8 8 8% % %
    r/   c                     t           dS t          t          j                    at	          d | j        D                       rt                              |            dS dS )a  
    This function registers the datasets that have cache files in _TEMP_DIR_FOR_TEMP_CACHE_FILES in order
    to properly delete them before deleting the temporary directory.
    The temporary directory _TEMP_DIR_FOR_TEMP_CACHE_FILES is used when caching is disabled.
    Nc              3   |   K   | ]7}t          t          j                  t          |d                    j        v V  8dS )filenameN)r   r   r!   parents).0
cache_files     r-   	<genexpr>z?maybe_register_dataset_for_temp_dir_deletion.<locals>.<genexpr>U   sW         	+011T*Z:P5Q5Q5YY     r/   )r   r   r"   WeakSetanycache_filesadd)datasets    r-   ,maybe_register_dataset_for_temp_dir_deletionrK   I   s|     &- (/+2?+<+<(
  !-     6 	),,W55555	6 6r/   c                  <    t           t          t                     ng S r   )r   listr>   r/   r-   (get_datasets_with_cache_file_in_temp_dirrN   \   s    5U5a40111giir/   c                  
    da dS )  
    When applying transforms on a dataset, the data are stored in cache files.
    The caching mechanism allows to reload an existing cache file if it's already been computed.

    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
    after each transform.

    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.
    More precisely, if the caching is disabled:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes
    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
    the `download_mode` parameter in [`~datasets.load_dataset`].
    TN_CACHING_ENABLEDr>   r/   r-   enable_cachingrS   `   s    $ r/   c                  
    da dS )rP   FNrQ   r>   r/   r-   disable_cachingrU   u   s    $ r/   zUse datasets.enable_caching() or datasets.disable_caching() instead. This function will be removed in a future version of datasets.booleanc                 $    t          |           adS )a  
    When applying transforms on a dataset, the data are stored in cache files.
    The caching mechanism allows to reload an existing cache file if it's already been computed.

    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
    after each transform.

    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.
    More precisely, if the caching is disabled:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use :func:`datasets.Dataset.save_to_disk` to save a transformed dataset or it will be deleted when session closes
    - caching doesn't affect :func:`datasets.load_dataset`. If you want to regenerate a dataset from scratch you should use
    the ``download_mode`` parameter in :func:`datasets.load_dataset`.
    NboolrR   )rV   s    r-   set_caching_enabledrZ      s    * G}}r/   returnc                  *    t          t                    S )a  
    When applying transforms on a dataset, the data are stored in cache files.
    The caching mechanism allows to reload an existing cache file if it's already been computed.

    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
    after each transform.

    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.
    More precisely, if the caching is disabled:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use [`~datasets.Dataset.save_to_disk`]] to save a transformed dataset or it will be deleted when session closes
    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
    the `download_mode` parameter in [`~datasets.load_dataset`].
    rX   r>   r/   r-   is_caching_enabledr]      s    $  !!!r/   c                  N    t           d } t          |           a t           j        S )z7Return a directory that is deleted when session closes.Nc                  P    t                      D ]} |                                  d S r   )rN   __del__)dsets    r-   r*   z9get_temporary_cache_files_directory.<locals>.cleanup_func   s0    @BB   r/   r*   )r   r   r!   rb   s    r-   #get_temporary_cache_files_directoryrc      s7     &-	 	 	 *CP\)])])]&)..r/   c                        fd}|S )Nc                 0    D ]}| t           j        |<   | S r   )Hasherdispatch)functtypess     r-   proxyzhashregister.<locals>.proxy   s&     	& 	&A!%FOAr/   r>   )rj   rk   s   ` r-   hashregisterrl      s#        
 Lr/   c                       e Zd ZU dZi Zeed<   d Zede	e
ee
         f         defd            Zededefd            Zededefd            Zdedd	fd
ZdefdZd	S )rf   z-Hasher that accepts python objects as inputs.rg   c                 6    t          j                    | _        d S r   )xxhashxxh64mr6   s    r-   r.   zHasher.__init__   s    r/   valuer[   c                     t          |t                    r|gn|}t          j                    }|D ]}|                    |           |                                S r   )
isinstancebytesro   rp   update	hexdigest)clsrr   rq   xs       r-   
hash_byteszHasher.hash_bytes   sU    %eU33>LNN 	 	AHHQKKKK{{}}r/   c                 F    |                      t          |                    S r   )rz   r   rx   rr   s     r-   hash_defaultzHasher.hash_default   s    ~~eEll+++r/   c                     t          |          | j        v r$ | j        t          |                   | |          S |                     |          S r   )typerg   r}   r|   s     r-   hashzHasher.hash   sH    ;;#,&&,3<U,S%888##E***r/   Nc                 
   dt          |           d}|                     |          }| j                            |                    d                     | j                            |                    d                     d S )Nz==utf8utf-8)r   r   rq   rv   encode)r)   rr   header_for_updatevalue_for_updates       r-   rv   zHasher.update   sw    0e00099U++'..v66777&--g6677777r/   c                 4    | j                                         S r   )rq   rw   r6   s    r-   rw   zHasher.hexdigest   s    v!!!r/   )r:   r;   r<   r=   rg   r   __annotations__r.   classmethodr   ru   r	   strrz   r   r}   r   rv   rw   r>   r/   r-   rf   rf      s        77Hd      uUDK%78 S    [ , , , , , [, + + + + + [+8C 8D 8 8 8 8"3 " " " " " "r/   rf   c                       fdd                     fdt          j                  D                                                                d                    S )Nc                     t          | t          j                  r$                    d | j        D                       S                     |                                                     d                    S )Nc              3   d   K   | ]+}|                                                     d           V  ,dS )r   N)	to_stringr   )rC   cs     r-   rE   z9_hash_pa_table.<locals>._hash_pa_array.<locals>.<genexpr>  s8      $Y$YqQ[[]]%9%9'%B%B$Y$Y$Y$Y$Y$Yr/   r   )rt   paChunkedArrayrz   chunksr   r   )rr   hashers    r-   _hash_pa_arrayz&_hash_pa_table.<locals>._hash_pa_array   si    eR_-- 	H$$$Y$YEL$Y$Y$YYYY$$U__%6%6%=%=g%F%FGGGr/   -c              3   F   K   | ]}|d z    |                   z   V  dS )r   Nr>   )rC   colr   rr   s     r-   rE   z!_hash_pa_table.<locals>.<genexpr>  s:      bbS3Yc
!;!;;bbbbbbr/   r   )joinsortedcolumn_namesrz   r   )r   rr   r   s   ``@r-   _hash_pa_tabler      sy    H H H H H HHbbbbbveN`GaGabbbbbEU\\'22333r/   c                     |                      t          j        t          |          d                              d                    S )NT)	sort_keysr   )rz   jsonr   r   r   )r   rr   s     r-   _hash_dataset_infor   	  s9    TZuFFFMMgVVWWWr/   fingerprint_warningsc                 j   | j         }t                      }t          |          D ]9}|dk    r	|                    |           |                    ||                    :| j        D ]:}|                    t
          j                            |d                              ;|                                S )N_fingerprintrA   )	__dict__rf   r   rv   rH   r1   r2   getmtimerw   )rJ   stater   keyrD   s        r-   generate_fingerprintr     s    EXXFe}} " ".  ceCj!!!!) @ @
bg&&z*'=>>????r/   @   c                 <    t          j        |           d| dz   dS )N0   ry   )randomgetrandbits)nbitss    r-   generate_random_fingerprintr   $  s(     ''6%(66666r/   c                    t                      }|                    |            	 |                    |           n#  t          rct                              dd          s)t
                              d| d           dt          d<   n=t
                              d| d           nt
                              d| d           t                      cY S xY wt          |          D ]}|                    |           	 |                    ||                    4#  t          r{t                              dd          s5t
                              d| d	||          d
| d           dt          d<   nUt
                              d| d	||          d
| d           n*t
                              d| d	||          d
| d           t                      cY c S xY w|
                                S )N(update_fingerprint_transform_hash_failedFz
Transform a   couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.Tz= couldn't be hashed properly, a random hash was used instead.zn couldn't be hashed properly, a random hash was used instead. This doesn't affect caching since it's disabled.zParameter 'z'=z of the transform )rf   rv   rR   r   getloggerwarninginfor   r   rw   )fingerprint	transformtransform_argsr   r   s        r-   update_fingerprintr   (  s   XXF
MM+-i    - 	'++,VX]^^ 	se e e e   TX$%OPPqqqqrrrrKK WY  W  W  W   +,,,,,n%% 1 1c	1MM.-....	1 +//0Z\abb NNic i i^C-@ i iT] i i i   X\()STTKK ]c  ]  ]^C-@  ]  ]T]  ]  ]  ]     J#  J  J)<  J  JPY  J  J  J   /0000000s   ; BC?DB<Gr   c           
         t          | t                    r| st          d|  d          t          D ]!}|| v rt          dt           d|  d          "t	          |           |k    r&t          d| d|  dt	          |            d	          d
S )z
    Make sure the fingerprint is a non-empty string that is not longer that max_length=64 by default,
    so that the fingerprint can be used to name cache files without issues.
    zInvalid fingerprint 'z#': it should be a non-empty string.z5Invalid fingerprint. Bad characters from black list 'z' found in 'z6'. They could create issues when creating cache files.z&Invalid fingerprint. Maximum lenth is z but 'z' has length z2.It could create issues when creating cache files.N)rt   r   
ValueErrorr   len)r   
max_lengthinvalid_chars      r-   validate_fingerprintr   Z  s   
 k3'' c{ caaaabbb:  ;&&GHj G G  yD G G G   '
 ;*$$@Z @ @{ @ @adepaqaq @ @ @
 
 	
 %$r/   rh   versionc                 <    | j          d| j         }||d| z  }|S )zW
    Format a transform to the format that will be used to update the fingerprint.
    .N@)r;   r<   )rh   r   r   s      r-    format_transform_for_fingerprintr   n  s8     ?88T%688I]]]"	r/   Fargskwargs
use_kwargsignore_kwargsrandomized_functionc                    |                                 }|rmd t          j        |           j                                        D             }|dd         }|dd         }|                    t          ||                     n4|t          t          t          j        |           j                            = r fd|	                                D             }r fd|	                                D             }|r|
                    d          o|
                    d          Zt          j                                        ^}}	}
}|
dk     r|	|
         n|	d	         }	t          j                            |	          |d<   d
 t          j        |           j                                        D             }|	                                D ]*\  }}||v r!||         |k    r|                    |           +|S )ze
    Format the kwargs of a transform to the format that will be used to update the fingerprint.
    c                 2    g | ]}||j         k    |j        S r>   )VAR_KEYWORDr!   rC   ps     r-   
<listcomp>z1format_kwargs_for_fingerprint.<locals>.<listcomp>  s)    dddQQRVWVcQcQc!&QcQcQcr/   r   Nc                 $    i | ]\  }}|v 	||S r>   r>   )rC   kvr   s      r-   
<dictcomp>z1format_kwargs_for_fingerprint.<locals>.<dictcomp>  s+    !e!e!e41aUVZdUdUd!QUdUdUdr/   c                 $    i | ]\  }}|v	||S r>   r>   )rC   r   r   r   s      r-   r   z1format_kwargs_for_fingerprint.<locals>.<dictcomp>  s+    !l!l!l41aUV^kUkUk!QUkUkUkr/   seed	generatorip  r   c                 R    i | ]$}|j         t          j        k    |j        |j         %S r>   )defaultinspect_emptyr!   r   s     r-   r   z1format_kwargs_for_fingerprint.<locals>.<dictcomp>  s7       RSR[_f_mRmRm	RmRmRmr/   )copyr   	signature
parametersvaluesrv   zipnextiteritemsr   npr   	get_statedefault_rngpop)rh   r   r   r   r   r   kwargs_for_fingerprintparams_r   posdefault_valuesdefault_varnamedefault_values      ``         r-   format_kwargs_for_fingerprintr   x  s$    $[[]] 
dd'"3D"9"9"D"K"K"M"MdddABBx%%c&$&7&78888"g'--899::
  f!e!e!e!e3I3O3O3Q3Q!e!e!e m!l!l!l!l3I3O3O3Q3Q!l!l!l N!%%f--5:P:T:TU`:a:a:i!y2244AtS1 #c		499tAwD24)2G2G2M2M";/ #*#4T#:#:#E#L#L#N#N  N +9*>*>*@*@ 8 8&4449OP_9`dq9q9q"&&777!!r/   inplacefingerprint_namesc                 2    4t          t                    st          dt                               4t          t                    st          dt                                rrt          d          ndg fd}|S )a  
    Wrapper for dataset transforms to update the dataset fingerprint using ``update_fingerprint``
    Args:
        inplace (:obj:`bool`):  If inplace is True, the fingerprint of the dataset is updated inplace.
            Otherwise, a parameter "new_fingerprint" is passed to the wrapped method that should take care of
            setting the fingerprint of the returned Dataset.
        use_kwargs (:obj:`List[str]`, optional): optional white list of argument names to take into account
            to update the fingerprint to the wrapped method that should take care of
            setting the fingerprint of the returned Dataset. By default all the arguments are used.
        ignore_kwargs (:obj:`List[str]`, optional): optional black list of argument names to take into account
            to update the fingerprint. Note that ignore_kwargs prevails on use_kwargs.
        fingerprint_names (:obj:`List[str]`, optional, defaults to ["new_fingerprint"]):
            If the dataset transforms is not inplace and returns a DatasetDict, then it can require
            several fingerprints (one per dataset in the DatasetDict). By specifying fingerprint_names,
            one fingerprint named after each element of fingerprint_names is going to be passed.
        randomized_function (:obj:`bool`, defaults to False): If the dataset transform is random and has
            optional parameters "seed" and "generator", then you can set randomized_function to True.
            This way, even if users set "seed" and "generator" to None, then the fingerprint is
            going to be randomly generated depending on numpy's current state. In this case, the
            generator is set to np.random.default_rng(np.random.get_state()[1][0]).
        version (:obj:`str`, optional): version of the transform. The version is taken into account when
            computing the fingerprint. If a datase transform changes (or at least if the output data
            that are cached changes), then one should increase the version. If the version stays the
            same, then old cached data could be reused that are not compatible with the new transform.
            It should be in the format "MAJOR.MINOR.PATCH".
    Nz)use_kwargs is supposed to be a list, not z,ignore_kwargs is supposed to be a list, not z5fingerprint_names are only used when inplace is Falsenew_fingerprintc                 \    s*t           fdD                       st          d          rBd j        j        vrt          d  d          d j        j        vrt          d  d          t	                     t                      fd	            }d
|_        |S )Nc              3   4   K   | ]}|j         j        v V  d S r   )__code__co_varnames)rC   r!   rh   s     r-   rE   z>fingerprint_transform.<locals>._fingerprint.<locals>.<genexpr>  s-      "c"c44=+D#D"c"c"c"c"c"cr/   zFfunction {func} is missing parameters {fingerprint_names} in signaturer   z'seed' must be in z's signaturer   z'generator' must be in )r   c            	         t          | |	          }| r| d         }| dd          } nF|                    t          t          t	          j                  j                                      }
rt          |j        |          }nND ]K}|	                    |          ||d<   t          |j        |          ||<   6t          ||                    L |g| R i |}
r||_        |S )N)r   r   r   r   r   fingerprint_name)r   r   r   r   r   r   r   r   r   r   r   )r   r   r   rJ   r   r   outr   rh   r   r   r   r   r   s          r-   wrapperz<fingerprint_transform.<locals>._fingerprint.<locals>.wrapper  sC   %B%+$7& & &"  ^#'7ABBx#)::d48I$8O8O8Z3[3[.\.\#]#]  
G"4W5I9Vl"m"m(9 G G$zz"233;EU./AB3E#0)=S4 4/00 -V4D-EFFFF $w000000C  7'6$Jr/   r   )allr   r   r   r   r   _decorator_name_)	rh   r   r   r   r   r   r   r   r   s	   ` @r-   r   z+fingerprint_transform.<locals>._fingerprint  s	    	gs"c"c"c"cQb"c"c"ccc 	gefff 	OT]666 !Hd!H!H!HIII$-";;; !M4!M!M!MNNN4T7KKK		t&	 &	 &	 &	 &	 &	 &	 &	 &	 &	 
&	P $1 r/   )rt   rM   r   r   )r   r   r   r   r   r   r   s   `````` r-   fingerprint_transformr     s    F jT&B&BWT*EUEUWWXXX M4)H)H ZZHXHXZZ[[[ R$ RPQQQ->-J))QbPc6 6 6 6 6 6 6 6 6 6p r/   )r   r   )NNF)NNNFN)Hr   r   r1   r   r4   r   r"   	functoolsr   pathlibr   typingr   r   r   r   r	   r
   r   r   numpyr   pyarrowr   ro   r   r   namingr   tabler   r   r   r   utils.deprecation_utilsr   utils.loggingr   utils.py_utilsr   r   arrow_datasetr   r:   r   rR   r   r   r   rF   r   rK   rN   rS   rU   rY   rZ   r]   r   rc   rl   rf   r   r   r   r   r   r   r   r   r   r   r>   r/   r-   <module>r	     s     				                 S S S S S S S S S S S S S S S S S S S S                6 6 6 6 6 6 N N N N N N N N N N N N / / / / / / % % % % % % ) ) ) ) ) ) ) )  '&&&&&& 
H		   HL )D E L L L>B  (7?"; B B B       06 6 6&j j j  *  *  J % % % % %*"D " " " "*/S / / / /&  "" "" "" "" "" "" "" ""T bh}.?ASTT4 4 UT4 kX X X )+ d39o * * *S    7 7S 7 7 7 7/ / /d
 
c 
 
 
 
( 8 hsm WZ     '+)- %*" *"
*"
*" cN*" c#	*"
 DI&*" *" 
#s(^*" *" *" *"^ '+)--1 %!f ffc#f DI&f  S	*	f
 f c]f f f f f fr/   