
    +gdr7                     *   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
mZ d dlmZ d dlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ  ee          Z G d
 d          Z G d de
          Z G d dee
          Z G d de          Z G d de          Z  G d de          Z! G d de          Z" G d de          Z# G d de          Z$ G d de          Z% G d de          Z& G d  d!e          Z' G d" d#          Z(dS )$    N)ABCabstractmethod)Path)DictListOptionalTypeUnion   )config   )FileLock)
get_loggerc                   b    e Zd Zddee         fdZdedefdZdededefd	ZddededefdZ	dS )ExtractManagerN	cache_dirc                     |r*t           j                            |t          j                  nt          j        | _        t          | _        d S N)	ospathjoinr   EXTRACTED_DATASETS_DIREXTRACTED_DATASETS_PATHextract_dir	Extractor	extractor)selfr   s     6lib/python3.11/site-packages/datasets/utils/extract.py__init__zExtractManager.__init__   s;    FOsBGLLF$ABBBU[Us 	 #    r   returnc                     ddl m} t          j                            |          }t          j                            | j         ||                    S )Nr   )hash_url_to_filename)
file_utilsr#   r   r   abspathr   r   )r   r   r#   abs_paths       r   _get_output_pathzExtractManager._get_output_path   sN    444444 7??4((w||D,.B.B8.L.LMMMr    output_pathforce_extractc                     |pSt           j                            |           o3t           j                            |          ot          j        |           S r   )r   r   isfileisdirlistdir)r   r(   r)   s      r   _do_extractzExtractManager._do_extract%   sK     
{+++lRW]];5O5O5kTVT^_jTkTk0l	
r    F
input_pathc                     | j                             |          }|s|S |                     |          }|                     ||          r| j                             |||           |S r   )r   infer_extractor_formatr'   r.   extract)r   r/   r)   extractor_formatr(   s        r   r2   zExtractManager.extract*   ss    >@@LL 	++J77K77 	NN"":{<LMMMr    r   F)
__name__
__module____qualname__r   strr   r'   boolr.   r2    r    r   r   r      s        # #(3- # # # #NS NS N N N N
s 
4 
D 
 
 
 

 # d s      r    r   c                       e Zd Zeedeeef         defd                        Z	e
edeeef         deeef         ddfd                        ZdS )BaseExtractorr   r!   c                     d S r   r:   clsr   kwargss      r   is_extractablezBaseExtractor.is_extractable5   	     	r    r/   r(   Nc                     d S r   r:   )r/   r(   s     r   r2   zBaseExtractor.extract:   rB   r    )r5   r6   r7   classmethodr   r
   r   r8   r9   rA   staticmethodr2   r:   r    r   r<   r<   4   s        %c	"2     ^ [ E$), 5s;K PT    ^ \  r    r<   c                       e Zd ZU g Zee         ed<   edee	e
f         defd            Zed
dee	e
f         dedefd            Zd	S )MagicNumberBaseExtractormagic_numbersr   magic_number_lengthc                     t          | d          5 }|                    |          cd d d            S # 1 swxY w Y   d S )Nrb)openread)r   rI   fs      r   read_magic_numberz*MagicNumberBaseExtractor.read_magic_numberC   s    $ 	/66-..	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/s   377r    magic_numberr!   c                     sGt          d | j        D                       }	 |                     ||          n# t          $ r Y dS w xY wt	          fd| j        D                       S )Nc              3   4   K   | ]}t          |          V  d S r   )len).0cls_magic_numbers     r   	<genexpr>z:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>K   s,      %f%f@Pc*:&;&;%f%f%f%f%f%fr    Fc              3   B   K   | ]}                     |          V  d S r   )
startswith)rT   rU   rP   s     r   rV   z:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>P   s3      ggAQ<**+;<<ggggggr    )maxrH   rO   OSErrorany)r?   r   rP   rI   s     ` r   rA   z'MagicNumberBaseExtractor.is_extractableH   s     	"%%f%fTWTe%f%f%f"f"f"44T;NOO   uuggggUXUfggggggs   : 
AANr    )r5   r6   r7   rH   r   bytes__annotations__rE   r
   r   r8   intrO   rD   r9   rA   r:   r    r   rG   rG   @   s         !#M4;###/dCi 0 /s / / / \/ h h%c	"2 h% hRV h h h [h h hr    rG   c                       e Zd Zedeeef         defd            Ze	d             Z
e	deeef         deeef         ddfd            ZdS )	TarExtractorr   r!   c                 *    t          j        |          S r   )tarfile
is_tarfiler>   s      r   rA   zTarExtractor.is_extractableT   s    !$'''r    c              #   P  K   dt           dt           fddt           dt           dt          ffddt           dt          ffd} |          }| D ]} |j        |          r$t                              d|j         d           7|                                r7 |||          r+t                              d|j         d	|j                    |                                r7 |||          r+t                              d|j         d
|j                    |V  dS )a  
        Fix for CVE-2007-4559
        Desc:
            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile
            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)
            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.
        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559
        From: https://stackoverflow.com/a/10077309
        r   r!   c                 z    t           j                            t           j                            |                     S r   )r   r   realpathr%   )r   s    r   resolvedz*TarExtractor.safemembers.<locals>.resolvedd   s&    7##BGOOD$9$9:::r    basec                 ~     t           j                            ||                                         |           S r   )r   r   r   rX   )r   ri   rh   s     r   badpathz)TarExtractor.safemembers.<locals>.badpathg   s4    xT4 8 899DDTJJJJr    c                      t           j                            |t           j                            | j                                      } | j        |          S )N)ri   )r   r   r   dirnamenamelinkname)infori   tiprk   rh   s      r   badlinkz)TarExtractor.safemembers.<locals>.badlinkk   sI    (27<<bgoodi.H.HIIJJC74=s3333r    zExtraction of z is blocked (illegal path)z is blocked: Symlink to z is blocked: Hard link to N)r8   r9   rn   loggererrorissymro   islnk)membersr(   rr   ri   finfork   rh   s        @@r   safememberszTarExtractor.safemembersX   s     	;3 	;3 	; 	; 	; 	;	K# 	KS 	KT 	K 	K 	K 	K 	K 	K	4 	4 	4 	4 	4 	4 	4 	4 	4
 x$$ 	 	Ewuz4(( TejTTTUUUU 775$#7#7 bejbbRWR`bbcccc 775$#7#7 dejddTYTbddeeee	 	r    r/   r(   Nc                     t          j        |d           t          j        |           }|                    |t
                              ||                     |                                 d S )NTexist_ok)rw   )r   makedirsrc   rL   
extractallra   ry   close)r/   r(   tar_files      r   r2   zTarExtractor.extract|   sf    
K$////<
++K1I1I(T_1`1`aaar    )r5   r6   r7   rD   r
   r   r8   r9   rA   rE   ry   r2   r:   r    r   ra   ra   S   s        (%c	"2 ( ( ( ( [( ! ! \!F E$), 5s;K PT    \  r    ra   c                   X    e Zd ZdgZedeeef         deeef         ddfd            ZdS )GzipExtractors   r/   r(   r!   Nc                     t          j        | d          5 }t          |d          5 }t          j        ||           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S NrK   wb)gziprL   shutilcopyfileobj)r/   r(   	gzip_fileextracted_files       r   r2   zGzipExtractor.extract   s    Yz4(( 	>Ik4(( >N"9n===> > > > > > > > > > > > > > >	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>3   A!A	A!	A	A!A	A!!A%(A%	r5   r6   r7   rH   rE   r
   r   r8   r2   r:   r    r   r   r      sa         MM>E$), >5s;K >PT > > > \> > >r    r   c                        e Zd Zg dZeddeeef         dede	f fd            Z
edeeef         deeef         dd	fd
            Z xZS )ZipExtractor)s   PKs   PKs   PKr    r   rP   r!   c                    t                                          ||          rdS 	 ddlm}m}m}m}m}m}m	}	m
}
m}m} t          |d          5 } |	|          }|r||         dk    r&||         dk    r||         dk    r	 d d d            dS ||         ||         k    r|                    ||                    |                                ||         k    rc||         |
k    rW|                    |
          }t#          |          |
k    r/t%          j        ||          }||         |k    r	 d d d            dS d d d            n# 1 swxY w Y   dS # t(          $ r Y dS w xY w)NrP   Tr   )
_CD_SIGNATURE_ECD_DISK_NUMBER_ECD_DISK_START_ECD_ENTRIES_TOTAL_ECD_OFFSET	_ECD_SIZE_EndRecDatasizeCentralDirstringCentralDirstructCentralDirrK   F)superrA   zipfiler   r   r   r   r   r   r   r   r   r   rL   seektellrM   rS   structunpack	Exception)r?   r   rP   r   r   r   r   r   r   r   r   r   r   fpendrecdatacentdir	__class__s                    r   rA   zZipExtractor.is_extractable   sk   77!!$\!BB 	4	                        dD!! 0R$R 
001Q666);LPQ;Q;QV\]hVimnVnVn#	0 0 0 0 0 0 0 0
   01VO5LLL{ 34447799{(;;;y@QUc@c@c#%77>#:#:D"4yyN::*0-8H$*O*O#*=#9=M#M#M+/0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 	 	 	55	sH   (E 3E
E B!E
1E >E 
EE EE 
E%$E%r/   r(   Nc                     t          j        |d           t          j        | d          5 }|                    |           |                                 d d d            d S # 1 swxY w Y   d S )NTr{   r)r   r}   r   ZipFiler~   r   )r/   r(   zip_files      r   r2   zZipExtractor.extract   s    
K$////_Z-- 	,,,NN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   *A##A'*A'r\   )r5   r6   r7   rH   rD   r
   r   r8   r]   r9   rA   rE   r2   __classcell__)r   s   @r   r   r      s          M " "%c	"2 "% "RV " " " " " ["H E$), 5s;K PT    \    r    r   c                   X    e Zd ZdgZedeeef         deeef         ddfd            ZdS )XzExtractors   7zXZ r/   r(   r!   Nc                     t          j        |           5 }t          |d          5 }t          j        ||           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )Nr   )lzmarL   r   r   r/   r(   compressed_filer   s       r   r2   zXzExtractor.extract   s   Yz"" 	Dok4(( DN"?NCCCD D D D D D D D D D D D D D D	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	Ds3   A AA A	A A	A  A$'A$r   r:   r    r   r   r      sk        01MDE$), D5s;K DPT D D D \D D Dr    r   c                   Z    e Zd ZddgZedeeef         deeef         ddfd            ZdS )RarExtractors   Rar! s   Rar! r/   r(   r!   Nc                     t           j        st          d          dd l}t	          j        |d           |                    |           }|                    |           |                                 d S )NzPlease pip install rarfiler   Tr{   )	r   RARFILE_AVAILABLEImportErrorrarfiler   r}   RarFiler~   r   )r/   r(   r   rfs       r   r2   zRarExtractor.extract   sn    ' 	<:;;;
K$////__Z((
k"""





r    r   r:   r    r   r   r      se        (*ABME$), 5s;K PT    \  r    r   c                   X    e Zd ZdgZedeeef         deeef         ddfd            ZdS )ZstdExtractors   (/r/   r(   r!   Nc                 :   t           j        st          d          dd l}|                                }t          | d          5 }t          |d          5 }|                    ||           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )NzPlease pip install zstandardr   rK   r   )r   ZSTANDARD_AVAILABLEr   	zstandardZstdDecompressorrL   copy_stream)r/   r(   zstddctxifhofhs         r   r2   zZstdExtractor.extract   s!   ) 	><===    $$&&*d## 	'sDd,C,C 	'sS#&&&	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	's6   BA8,B8A<	<B?A<	 BBBr   r:   r    r   r   r      sb        ()M'E$), '5s;K 'PT ' ' ' \' ' 'r    r   c                   X    e Zd ZdgZedeeef         deeef         ddfd            ZdS )Bzip2Extractors   BZhr/   r(   r!   Nc                     t          j        | d          5 }t          |d          5 }t          j        ||           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S r   )bz2rL   r   r   r   s       r   r2   zBzip2Extractor.extract   s   Xj$'' 	D?k4(( DN"?NCCCD D D D D D D D D D D D D D D	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	Dr   r   r:   r    r   r   r      sk        $%MDE$), D5s;K DPT D D D \D D Dr    r   c                   X    e Zd ZdgZedeeef         deeef         ddfd            ZdS )SevenZipExtractors   7z'r/   r(   r!   Nc                     t           j        st          d          dd l}t	          j        |d           |                    | d          5 }|                    |           d d d            d S # 1 swxY w Y   d S )NzPlease pip install py7zrr   Tr{   r   )r   PY7ZR_AVAILABLEr   py7zrr   r}   SevenZipFiler~   )r/   r(   r   archives       r   r2   zSevenZipExtractor.extract   s    % 	:8999
K$////
C00 	,G{+++	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	,s   A//A36A3r   r:   r    r   r   r      sb        01M,E$), ,5s;K ,PT , , , \, , ,r    r   c                   X    e Zd ZdgZedeeef         deeef         ddfd            ZdS )Lz4Extractors   "Mr/   r(   r!   Nc                 &   t           j        st          d          dd l}|j                            | d          5 }t          |d          5 }t          j        ||           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )NzPlease pip install lz4r   rK   r   )r   LZ4_AVAILABLEr   	lz4.frameframerL   r   r   )r/   r(   lz4r   r   s        r   r2   zLz4Extractor.extract  s7   # 	86777Y^^J-- 	Dk4(( DN"?NCCCD D D D D D D D D D D D D D D	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	Ds5   BA."B.A2	2B5A2	6BB
B
r   r:   r    r   r   r     sk        ()MDE$), D5s;K DPT D D D \D D Dr    r   c                   d   e Zd ZU eeeeeee	e
ed	Zeeee         f         ed<   ed             Zedeeef         defd            Zeddeeef         ded	efd
            Zedeeef         d	efd            Ze	 	 ddeeef         deeef         dee         dee         d	df
d            ZdS )r   )	tarr   zipxzrarr   r   7zr   
extractorsc                 b    t          d | j                                        D                       S )Nc              3   r   K   | ]2}t          |t                    |j        D ]}t          |          V  3d S r   )
issubclassrG   rH   rS   )rT   r   extractor_magic_numbers      r   rV   z9Extractor._get_magic_number_max_length.<locals>.<genexpr>  sn       
 
)%=>>
 +4*A	
 
 ' &''
 
 
 
 
 
 
r    )rY   r   values)r?   s    r   _get_magic_number_max_lengthz&Extractor._get_magic_number_max_length  s>     
 
 ^2244
 
 
 
 
 	
r    r   rI   c                 ^    	 t                               | |          S # t          $ r Y dS w xY w)N)rI   r    )rG   rO   rZ   )r   rI   s     r   _read_magic_numberzExtractor._read_magic_number&  sC    	+==dXk=lll 	 	 	33	s    
,,Freturn_extractorr!   c                     t          j        dt                     |                     |          }|r|sdnd| j        |         fS |sdndS )Nz{Method 'is_extractable' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'infer_extractor_format' instead.categoryTF)FN)warningswarnFutureWarningr1   r   )r?   r   r   r3   s       r   rA   zExtractor.is_extractable-  sm    4"	
 	
 	
 	

 55d;; 	^/]44dCNK[<\5]],?uu-?r    c                     |                                  }|                     ||          }| j                                        D ] \  }}|                    ||          r|c S !d S )Nr   )r   r   r   itemsrA   )r?   r   magic_number_max_lengthrP   r3   r   s         r   r1   z Extractor.infer_extractor_format9  s    "%"B"B"D"D--d4KLL+.>+?+?+A+A 	( 	('i''<'HH (''''(	( 	(r    N
deprecatedr/   r(   r3   r   c                    t          j        t           j                            |          d           t	          t          |                              d                    }t          |          5  t          j	        |d           |s|dk    rp|dk    st          |t                    s&t          j        dt                     |dk    r|n|}n| j        |         }|                    ||          cd d d            S t          j        dt                     | j                                        D ];}|                    |          r$|                    ||          c cd d d            S <	 d d d            d S # 1 swxY w Y   d S )	NTr{   z.lock)ignore_errorsr   zsParameter 'extractor' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'extractor_format' instead.r   ztParameter 'extractor_format' was made required in version 2.4.0 and not passing it will raise an exception in 3.0.0.)r   r}   r   rm   r8   r   with_suffixr   r   rmtree
isinstancer   r   r   r   r2   r   rA   )r?   r/   r(   r3   r   	lock_paths         r   r2   zExtractor.extractA  sM    	BGOOK004@@@@[))55g>>??	i   	J 	JM+T:::: J9#<#<,,J?OQT4U4U,M:!.   
 .7,-F-F		L\II #/? @I (([AA	J 	J 	J 	J 	J 	J 	J 	J **   
 "%!6!6!8!8 J JI //
;; J(00[IIII+	J 	J 	J 	J 	J 	J 	J 	J(JJ'	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	Js    2BE?A"E?0E??FFr4   )Nr   )r5   r6   r7   ra   r   r   r   r   r   r   r   r   r   r   r8   r	   r<   r^   rD   r   rE   r
   r   r_   r   r9   rA   r1   r   r2   r:   r    r   r   r     s         
2 
2JS$}--. 
 
 
 
 
 [
 tSy!1     \ 	@ 	@%c	"2 	@d 	@W[ 	@ 	@ 	@ [	@ (%c	*: (s ( ( ( [( 
 +/-9J J$)$J 49%J #3-	J
 M*J 
J J J [J J Jr    r   ))r   r   r   r   r   r   rc   r   r   abcr   r   pathlibr   typingr   r   r   r	   r
    r   filelockr   loggingr   r5   rs   r   r<   rG   ra   r   r   r   r   r   r   r   r   r   r:   r    r   <module>r      sY   



   				      # # # # # # # #       4 4 4 4 4 4 4 4 4 4 4 4 4 4                   
H		       <	 	 	 	 	C 	 	 	h h h h h}c h h h&. . . . .= . . .b> > > > >, > > >1 1 1 1 1+ 1 1 1hD D D D D* D D D    +   ' ' ' ' ', ' ' 'D D D D D- D D D, , , , ,0 , , ,D D D D D+ D D DRJ RJ RJ RJ RJ RJ RJ RJ RJ RJr    