
    +gdBZ                     R   d dl Z d dlZd dlZd dlZd dlZd dlmc mZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZm Z  d dl!m"Z"  ee#          Z$dZ%d Z& G d de          Z' ed           G d de                      Z(dS )    N)ArgumentParser)Path)Optional)config)BaseDatasetsCLICommand)DownloadConfig)DownloadManager)MockDownloadManager)dataset_module_factoryimport_main_class)
deprecated)
get_loggerset_verbosity_warning)
map_nestedzutf-8c                     t          | j        | j        | j        | j        | j        | j        | j        | j        | j	        	  	        S N)
DummyDataCommandpath_to_datasetauto_generaten_lines
json_fieldxml_tagmatch_text_fileskeep_uncompressed	cache_direncoding)argss    <lib/python3.11/site-packages/datasets/commands/dummy_data.pydummy_data_command_factoryr      sE    
 
 
    c                       e Zd Z fdZ fdZ fdZ	 	 	 	 	 ddedee         dee         d	ee         d
ee         de	fdZ
	 	 	 	 ddedededee         dee         d	ee         d
ee         defdZedefd            Zd Z xZS )!DummyDataGeneratorDownloadManagerc                 d     t                      j        |i | || _        g | _        g | _        d S r   )super__init__mock_download_managerdownloaded_dummy_pathsexpected_dummy_paths)selfr&   r   kwargs	__class__s       r   r%   z*DummyDataGeneratorDownloadManager.__init__*   s=    $)&)))%:"&(#$&!!!r    c                     t                                          |          }| j                            |          }t          | j        j        |d           t          | j        j        |d           |S NT)	map_tuple)r$   downloadr&   r   r'   appendr(   r)   url_or_urlsoutputdummy_outputr+   s       r   r/   z*DummyDataGeneratorDownloadManager.download0   sk    !!+..1::;GG4.5vNNNN4,3\TRRRRr    c                 ,   t                                          t                                          |                    }| j                            |          }t	          | j        j        |d           t	          | j        j        |d           |S r-   )r$   extractr/   r&   r   r'   r0   r(   r1   s       r   download_and_extractz6DummyDataGeneratorDownloadManager.download_and_extract7   s{    !1!1+!>!>??1::;GG4.5vNNNN4,3\TRRRRr       Nr   r   r   r   r   returnc                    t          j        t           j                            | j        j        | j        j        | j        j        d          d           d}d| j        _        t          | j
        | j                  D ]d\  }}t           j                            | j        j        | j        j        | j        j        |          }	||                     ||	|||||          z  }e|dk    rt                              d           |dk    S )N
dummy_dataTexist_okr   Fr   r   r   r   r   zDummy data generation failed: no dummy files were created. Make sure the data files format is supported by the auto-generation.)osmakedirspathjoinr&   datasets_scripts_dirdataset_namedummy_data_folderload_existing_dummy_datazipr'   r(   _create_dummy_dataloggererror)
r)   r   r   r   r   r   totalsrc_pathrelative_dst_pathdst_paths
             r   auto_generate_dummy_data_folderzADummyDataGeneratorDownloadManager.auto_generate_dummy_data_folder>   s*    	GLL*?*7*<	  	
 	
 	
 	
 >C";+.t/JDLe+f+f 	 	'H'w||*?*7*<!	 H T,,%!1! -   EE A::LLW   qyr    rL   rN   c                    |pt           }t          j                            |          r\t                              d|            t          |          j        g d}t          fd|D                       }	|Ot          j        	                    |          }
|
                    d          D ]}|	t          j        |
|          z  }	|	rt          |          j                            dd           t          ||          5 }t          |d|          5 }g }t          |          D ]"\  }}|k    r n|                    |           #|                    d	                    |                                                     d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   d
S dv rTt          ||          5 }t)          j        |          }|||         }t-          |t.                    r~t1          d |                                D                       s2t5          dt7          |                                           d          fd|                                D             }n
|d          }|||i}t          |          j                            dd           t          |d|          5 }t)          j        ||           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   d
S t          fddD                       r9|t                              d           n|                      ||||           d
S t                              d| d           dS t          j        !                    |          rd}t          j"        |          D ]\  }}}|D ]}|#                    d          st          j                            ||          }t          j                            |t          |          $                    |                    }|| %                    ||||||          z  }|S d S )Nz#Trying to generate dummy data file )z.txtz.csvz.jsonlz.tsvc              3       K   | ]}|v V  	d S r    .0	extensiondst_path_extensionss     r   	<genexpr>zGDummyDataGeneratorDownloadManager._create_dummy_data.<locals>.<genexpr>w   s*      +v+vQZI9L,L+v+v+v+v+v+vr    ,Tr=   parentsr   w    z.jsonc              3   @   K   | ]}t          |t                    V  d S r   )
isinstancelist)rT   vs     r   rW   zGDummyDataGeneratorDownloadManager._create_dummy_data.<locals>.<genexpr>   s,      "S"S1:a#6#6"S"S"S"S"S"Sr    zCouldn't parse columns z\. Maybe specify which json field must be used to read the data with --json_field <my_field>.c                 ,    i | ]\  }}||d          S r   rR   )rT   krb   r   s      r   
<dictcomp>zHDummyDataGeneratorDownloadManager._create_dummy_data.<locals>.<dictcomp>   s'    *X*X*Xda1ak*X*X*Xr    c              3       K   | ]}|v V  	d S r   rR   rS   s     r   rW   zGDummyDataGeneratorDownloadManager._create_dummy_data.<locals>.<genexpr>   s)      XX)Y"55XXXXXXr    )z.xmlz.txmzEFound xml file but 'xml_tag' is set to None. Please provide --xml_tag)r   r   zCouldn't generate dummy file 'z9'. Ignore that if this file is not useful for dummy data.r   .r>   )&DEFAULT_ENCODINGr?   rA   isfilerI   debugr   suffixesanybasenamesplitfnmatchparentmkdiropen	enumerater0   writerB   stripjsonloadr`   dictallvalues
ValueErrorra   keysitemsdumpwarning_create_xml_dummy_dataisdirwalk
startswithrelative_torH   )r)   rL   rN   r   r   r   r   r   line_by_line_extensionsis_line_by_line_text_file	file_namepatternsrc_filedst_filefirst_linesiline	json_datafirst_json_datarK   rA   _filesnamesrc_file_pathdst_file_pathrV   s      `                      @r   rH   z4DummyDataGeneratorDownloadManager._create_dummy_datah   s    //7>>(## G	LLIxIIJJJ"&x.."9&H&H&H#(++v+v+v+v^u+v+v+v(v(v%+G,,X66	/55c:: U UG-G1T1TT--( 'X%++T4+HHH(X666 E(hh??? E8&('0':': 5 5GAt G|| %'..t4444 rww{';';'A'A'C'CDDDE E E E E E E E E E E E E E EE E E E E E E E E E E E E E E q///(X666 =( $	( 3 3I!-$-j$9	!)T22 	>""S"S	@P@P@R@R"S"S"SSS ",!Q$y~~?O?O:P:P !Q !Q !Q# # 
 +Y*X*X*XiooFWFW*X*X*X*3HWH*=!-+5*GNN)//t/LLLhh??? =8	/8<<<= = = = = = = = = = = = = = =#= = = = = = = = = = = = = = =& qXXXXGWXXXXX ?NN#jkkkk//(GU\go/pppqNNwwww   1W]]8$$ 	E"$'("3"3  a!  D??3// (*T4(@(@(*XtM?R?R?^?^_g?h?h(i(i!8!8))$+'1$+-=%- "9 " " 	 L!	 	sn   	F.A/FF.F	F.F	F..F25F2C=LK0$L0K4	4L7K4	8LLLc                    t          |          j                            dd           t          | |          5 }d}g }t	          j        |d          D ]i\  }}	|dk    r|                    |	           !|                                }
|	j        |k    r)||k     r|dz  }L|r|d	         	                    |	           jt	          j
        |	
                              ||           d d d            d S # 1 swxY w Y   d S )NTrY   r[   r   )startend)eventsr   r^   )element)r   rp   rq   rr   ET	iterparser0   poptagremoveElementTreert   )rL   rN   r   r   r   r   n_linerZ   eventelemr   s              r   r   z8DummyDataGeneratorDownloadManager._create_xml_dummy_data   so   X##T4#@@@(X... 	L(FG!|H=MNNN 
9 
9tG##NN4((((Ax7**!G++"aKFF& 9 ' 2 24 8 8 8N4(((..x(.KKK	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	Ls   B/C77C;>C;c                 0   t           j                            || j        j                  }t           j                            |d          }d}t
                              d| d           t          j        |d||           t          j	        |           d S )Nr;   z"Compressing dummy data folder to 'z.zip'rG   )
r?   rA   rB   r&   rE   rI   infoshutilmake_archivermtree)r)   r   root_dir	base_namebase_dirs        r   !compress_autogenerated_dummy_datazCDummyDataGeneratorDownloadManager.compress_autogenerated_dummy_data   s    7<<1K1]^^GLL<88	IIIIJJJIuhAAAi     r    )r8   NNNN)NNNN)__name__
__module____qualname__r%   r/   r7   intr   strboolrO   rH   staticmethodrh   r   r   __classcell__)r+   s   @r   r"   r"   )   s       ' ' ' ' '         $(!%*."&( (( SM( #	(
 #3-( 3-( 
( ( ( (^ %)!%*."&R RR R 	R
 SMR #R #3-R 3-R 
R R R Rh DEP` L L L \L$! ! ! ! ! ! !r    r"   zThe `datasets` repository does not host the dataset scripts anymore. Therefore, dummy data is no longer needed to test their loading with CI.c                       e Zd Zedefd            Zdededede	e         de	e         de	e         d	ed
e	e         de	e         fdZ
d Zde	e         fdZd ZdS )r   parserc                 |   |                      dd          }|                    ddd           |                    dt          d	d
           |                    dt          d d           |                    dt          d d           |                    dt          d d           |                    ddd           |                    dt          d d           |                    dt          d dt                      |                    dt          d           |                    t                     d S )Nr;   zGenerate dummy data.)helpz--auto_generate
store_truez!Automatically generate dummy data)actionr   z	--n_linesr8   zBNumber of lines or samples to keep when auto-generating dummy data)typedefaultr   z--json_fieldzOptional, json field to read the data from when auto-generating dummy data. In the json data files, this field must point to a list of samples as json objects (ex: the 'data' field for squad-like files)z	--xml_tagz[Optional, xml tag name of the samples inside the xml files when auto-generating dummy data.z--match_text_fileszOptional, a comma separated list of file patterns that looks for line-by-line text files other than *.txt or *.csv. Example: --match_text_files *.labelz--keep_uncompressedzWhether to leave the dummy data folders uncompressed when auto-generating dummy data. Useful for debugging for to do manual adjustements before compressing.z--cache_dirzKCache directory to download and cache files when auto-generating dummy dataz
--encodingz=Encoding to use when auto-generating dummy data. Defaults to r   z/Path to the dataset (example: ./datasets/squad))r   r   )func)
add_parseradd_argumentr   r   rh   set_defaultsr   )r   test_parsers     r   register_subcommandz$DummyDataCommand.register_subcommand   s   '';Q'RR  !2<Nq rrr  c13w 	! 	
 	
 	
 	   ^	 	! 	
 	
 	
 	  n	 	! 	
 	
 	
 	    k	 	! 	
 	
 	
 	  ! p 	! 	
 	
 	

 	  ^	 	! 	
 	
 	
 	  cQacc	 	! 	
 	
 	
 	  !2Ct uuu  &@ AAAAAr    r   r   r   r   r   r   r   r   r   c
                    || _         t          j                            |          r?|                    t          j        d                              d          d         | _        n>|                    t          j        d                              d          d         | _        t          j                            |pt          j
                  }|| _        || _        || _        || _        || _        || _        || _        |	| _        d S )N/r   )_path_to_datasetr?   rA   r   replaceseprn   _dataset_name
expanduserr   HF_DATASETS_CACHE_auto_generate_n_lines_json_field_xml_tag_match_text_files_keep_uncompressed
_cache_dir	_encoding)
r)   r   r   r   r   r   r   r   r   r   s
             r   r%   zDummyDataCommand.__init__	  s     !07==)) 	U!0!8!8!E!E!K!KC!P!PQS!TD!0!8!8!E!E!K!KC!P!PQS!TDG&&y'LF4LMM	+%!1"3#!r    c           	         t                       t          | j                  }t          |j                  }|j        pd g}g }t          j                    5 }|D ]}|r|j        nd } |||j	        |          }|r|j
        n|j        j
        }	t          | j        ||	dd          }
| j        r1|                    |                     ||
| j                             |                     ||
           | j        rG| j        s@t'          |          rt)          d| j         d           nt)          d	| j         d           d d d            d S # 1 swxY w Y   d S )
N)config_namehashr   TF)rD   r   versionuse_local_dummy_datarF   )dataset_buildermock_dl_managerr   )r   r   z>Automatic dummy data generation succeeded for all configs of ''z<Automatic dummy data generation failed for some configs of ')r   r   r   r   module_pathBUILDER_CONFIGStempfileTemporaryDirectoryr   r   r   r   r
   r   r   r0   _autogenerate_dummy_datar   _print_dummy_data_instructionsry   print)r)   dataset_modulebuilder_clsbuilder_configsauto_generate_resultstmp_dirbuilder_configr   r   r   r   s              r   runzDummyDataCommand.run$  s"   /0EFF'(BCC &5?$ "(** 	sg"1  5CMn11"-++NL_kr"s"s"s4Bf.00H^Hf"5!%!3)#)--2# # # & )0055,;,;.2.E 6      77(7 8     " s4+B s,-- ss[_[psssttttqY]Ynqqqrrr;	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	ss   C/EEEr9   c                 "   | j         r/t          j                            | j         t          j                  nt          j        }t          |          }t          | j	        ||          }|
                    |           d|_        |                    | j        | j        | j        | j        | j                   |st          j                            |j        |j                  }|                    |           d|_        i t          j        |j         d           	 |
                    |          }|D ]-}	|                    |	d           |	j        j        |	j        <   .	 t5          d                                 D                       r*t8                              d	|j        j         d
           dS fdD             }
t8                              d|
 d|j        j         d
           dS # t<          $ rC}t8                              d|j        j         dtA          |          z              Y d }~dS d }~ww xY wt          j                            | j!        |j"                  }t8          #                    d| d           d S )N)r   )rD   r&   download_configFr>   Tr<   )check_duplicate_keysc              3   "   K   | ]
}|d k    V  dS )r   NrR   )rT   
n_exampless     r   rW   z<DummyDataCommand._autogenerate_dummy_data.<locals>.<genexpr>q  s&      VV*zA~VVVVVVr    zEDummy data generation done and dummy data test succeeded for config 'z''.c                 ,    g | ]}|         d k    |S )r   rR   )rT   
split_namen_examples_per_splits     r   
<listcomp>z=DummyDataCommand._autogenerate_dummy_data.<locals>.<listcomp>w  s2     $ $ $'1MablMmqrMrMr
MrMrMrr    zCDummy data generation done but dummy data test failed since splits z have 0 examples for config 'z&Failed to load dummy data for config 'z''.
Original error:
z#Dummy data generated in directory 'zg' but kept uncompressed. Please compress this directory into a zip file to use it for dummy data tests.)$r   r?   rA   rB   r   DOWNLOADED_DATASETS_DIRDOWNLOADED_DATASETS_PATHr   r"   r   _split_generatorsrF   rO   r   r   r   r   r   rC   rD   r   r@   _prepare_split
split_infonum_examplesr   ry   rz   rI   r   OSErrorrJ   r   r   rE   r   )r)   r   r   r   dl_cache_dirr   
dl_managerpath_do_datasetsplit_generatorssplit_generatorempty_splitsegenerated_dummy_data_dirr   s                @r   r   z)DummyDataCommand._autogenerate_dummy_dataK  s,    1BGLL&*HIII0 	
 )<@@@6+?ds
 
 

 	))*55538022M'M!3^ 	3 	
 	
 	
 ! %	 gll?+OQ`QmnnO88III7;O4#% K2TBBBB!#2#D#D_#U#U '7 i iO#22?Y^2___APA[Ah()=>>i VV8L8S8S8U8UVVVVV !NN A`o`v`{  A  A  A    4$ $ $ $5I$ $ $L NN j^j  j  j  JY  J`  Je  j  j  j   !5'    p_=S=Xppp!ff   uuuuu* (*w||D4I?Kl'm'm$KKa6N a a a    s   !AG7 7
I8H??Ic           
         t           j                            | j        |j                  }t
                              d| d           t          j        |d           	 |                    |          }n?# t          $ r2}t          d| j         d|j         d|j         d           Y d }~nd }~ww xY wt                      }g }|j        }|D ]}	t
                              d	|	j                    |                    |	j                   |	j        }
 |j        d5i |
}	 d
}|j        d|j        j         dnd}|d|z   | j         d| d| dz   z  }|D ]\  }}|d| dz  }# t          $ r$}|                    |j                   Y d }~d }~ww xY wd                    |          }t+          |          dk    rt+          |          dk    rLt-          t/          |                    |k    r,|dt-          t/          |                     d| d| dz  }|}n*d                    |          }|d| d| dz  }|d| dz  }|d| d z  }t+          |          dk    rKt-          t/          |                    |k    r+|d!| d"| d#| d$z  }|d%| d&| d$z  }|d'| d(| d)| d*z  }n*|d+| d,| d#| d-z  }|d.| d/| d$z  }|d0| d(| d)| d*z  }|d1| d2| d3z  }|d4z  }t          |           d S )6Nz$Creating dummy folder structure for z... Tr<   zDataset z with config a   seems to already open files in the method `_split_generators(...)`. You might consider to instead only open files in the method `_generate_examples(...)` instead. If this is not possible the dummy data has to be created with less guidance. Make sure you create the file rg   z/Collecting dummy data file paths to create for zU
==============================DUMMY DATA INSTRUCTIONS==============================
zconfig z of r]   z(- In order to create the dummy data for z, please go into the folder 'z' with `cd z` . 

za- It appears that the function `_generate_examples(...)` expects one or more files in the folder z using the function `glob.glob(...)`. In this case, please refer to the `_generate_examples(...)` method to see under which filename the dummy data files should be created. 

z, r   r^   z1- Please create a single dummy data file called 'z' from the folder 'zV'. Make sure that the dummy data file provides at least one example for the split(s) 'z' 

z0- Please create the following dummy data files 'z'

z- For each of the splits 'zU', make sure that one or more of the dummy data files provide at least one example 

z- If the method `_generate_examples(...)` includes multiple `open()` statements, you might have to create other files in addition to 'zG'. In this case please refer to the `_generate_examples(...)` method 

z@- After the dummy data file is created, it should be zipped to 'z.zip' with the command `zip z.zip z` 

z- You can now delete the file 'z' with the command `rm z- To get the file 'z;' back for further changes to the dummy data, simply unzip z.zip with the command `unzip z.zip` 

zP- After all dummy data files are created, they should be zipped recursively to 'z.zip' with the command `zip -r z/` 

z!- You can now delete the folder 'z' with the command `rm -r z- To get the folder 'z'- Make sure you have created the file 'z
.zip' in 'z' 
zT===================================================================================
rR   )r?   rA   rB   r   rE   rI   r   r@   r   FileNotFoundErrorr   r   r   filenamesetdummy_file_namer   r0   
gen_kwargs_generate_examplesaddlennextiter)r)   r   r   rE   generator_splitsr   files_to_createsplit_namesr  rn   r  	generatordummy_data_guidance_printconfig_stringkeyrecordfiles_strings                    r   r   z/DummyDataCommand._print_dummy_data_instructions  s   GLL)>@abbR;LRRRSSS
%5555	.@@QQ  	 	 	 q4-  q  qO<R  q  q  de  dn  q  q  q       	
 %%)9% 	0 	0EKKV%*VVWWWuz***)J::HHZHHI0,i)CRCYCe?o49????km  *>#$+  D  DJ[  D  Dhy  D  D  DD) $-  KC)  .T  Ra  .T  .T  .T  T))$ 0 0 0##AJ////////0 ii,,!##?##q((T$2G2G-H-HO-[-[)  .Naefjkzf{f{a|a|  .N  .N  Rc  .N  .N  {F  .N  .N  .N  N).#yy99)  .Z`l  .Z  .Z  BS  .Z  .Z  .Z  Z))  .o+  .o  .o  .o  o)%  *J  s  *J  *J  *J  J%1$$d?.C.C)D)D)W)W%  *Gl{  *G  *G  Zi  *G  *G  p  *G  *G  *G  G%%q/qqZiqqq% &  *U  *U  *U  L[  *U  *U  zI  *U  *U  *U  U%%%  *[  }L  *[  *[  m|  *[  *[  CR  *[  *[  *[  [%%vOvv_nvvv% &  *W  *W  *W  N]  *W  *W  |K  *W  *W  *W  W%!hohhQbhhh	
! 	"_4!'(((((s1    A6 6
B2 (B--B2!AE''
F1FFN)r   r   r   r   r   r   r   r   r   r   r%   r   r   r   rR   r    r   r   r      s        *BN *B *B *B \*BX"" " 	"
 SM" #" #3-"  " C=" 3-" " " "6%s %s %sN8_ghl_m 8 8 8 8tO) O) O) O) O)r    r   ))ro   rv   r?   r   r   xml.etree.ElementTreeetreer   r   argparser   pathlibr   typingr   datasetsr   datasets.commandsr   !datasets.download.download_configr   "datasets.download.download_managerr	   'datasets.download.mock_download_managerr
   datasets.loadr   r    datasets.utils.deprecation_utilsr   datasets.utils.loggingr   r   datasets.utils.py_utilsr   r   rI   rh   r   r"   r   rR   r    r   <module>r$     s     				   " " " " " " " " " # # # # # #                   4 4 4 4 4 4 < < < < < < > > > > > > G G G G G G C C C C C C C C 7 7 7 7 7 7 D D D D D D D D . . . . . . 
H		   l! l! l! l! l! l! l! l!^  T y) y) y) y) y)- y) y) y) y) y)r    