
    +gd3Y                         d Z ddlZddlmZ ddlmZ ddlmZ ddlZ	ddl
ZddlmZ 	 ddlmZ n# e$ r dZY nw xY wdd	lmZ d
 Zd Zd Zd Z	 ddZd Z G d d          Z G d d          Zd ZdS )zTF-specific utils import.    N)partial)ceil)uuid4)get_context)SharedMemory   )configc                    t           j        rdd l}nt          d          | d         }i }|                                D ]\  }t          |t          j                  r$t          j        fd| D                       |<   Ct          ||j	                  r%|                    fd| D                       |<   }t          j
        fd| D                       |<   |S )Nr   FCalled a Tensorflow-specific function but Tensorflow is not installed.c                      g | ]
}|         S  r   .0fks     7lib/python3.11/site-packages/datasets/utils/tf_utils.py
<listcomp>z)minimal_tf_collate_fn.<locals>.<listcomp>-        8 8 8!1 8 8 8    c                      g | ]
}|         S r   r   r   s     r   r   z)minimal_tf_collate_fn.<locals>.<listcomp>/   r   r   c                      g | ]
}|         S r   r   r   s     r   r   z)minimal_tf_collate_fn.<locals>.<listcomp>1   r   r   )r	   TF_AVAILABLE
tensorflowImportErroritems
isinstancenpndarraystackTensorarray)featurestffirstbatchvr   s        @r   minimal_tf_collate_fnr'   #   s    dbcccQKEE : :1a$$ 	:x 8 8 8 8x 8 8 899E!HH29%% 	:xx 8 8 8 8x 8 8 899E!HHx 8 8 8 8x 8 8 899E!HHLr   c                 H    t          |           }d|v r|d         |d<   |d= |S )Nlabellabels)r'   )r"   r%   s     r   #minimal_tf_collate_fn_with_renamingr+   5   s3    !(++E%.h'NLr   c                 "   t           j                            |           rt          | j                  S t           j                            |           p=t           j                            |           pt           j                            |           S N)patypesis_listis_numeric_pa_type
value_type
is_integeris_floating
is_decimal)pa_types    r   r1   r1   =   sl    	x   6!'"45558w''h28+?+?+H+HhBHL_L_`gLhLhhr   c                    ddl m}m}m} ddlm} t          | |          rt          | j                  S t          | t                    rt          | d                   S t          | |          rt           |             j                  S t          | |          rt           |                       S t          | |          rdS dS )Nr   )
ClassLabelSequenceValue)_ArrayXDr   TF) r8   r9   r:   features.featuresr;   r   is_numeric_featurefeaturelistr1   storage_dtype)r?   r8   r9   r:   r;   s        r   r>   r>   C   s    ..........,,,,,,'8$$ !'/222	GT	"	" 	!'!*---	GX	&	& !''))"9:::	GU	#	# !'')),,,	GZ	(	( tur   Fc                    t          j        t          j        |           dk              r|| d         | d         dz            n||           fd                                D             t	          t                                                    d                   }fdt          |          D              |fi ||rQi }|                                D ]9\  }	}
t          j        |	                   }|	                    |
          }|||	<   :n`g }|                                D ]I\  }	}
t          j        |	                   }|	                    |
          }|
                    |           J|S )N   r   c                 ,    i | ]\  }}|v s|d v ||S ))r)   	label_idsr*   r   )r   keyvaluecols_to_retains      r   
<dictcomp>z np_get_batch.<locals>.<dictcomp>_   s?     
 
 
Un$$/O(O(O (O(O(Or   c                 R    g | ]"fd                                  D             #S )c                 (    i | ]\  }}||         S r   r   )r   rG   rH   is      r   rJ   z+np_get_batch.<locals>.<listcomp>.<dictcomp>g   s#    <<<
Uc58<<<r   r   )r   rM   r%   s    @r   r   z np_get_batch.<locals>.<listcomp>g   s7    YYY<<<<ekkmm<<<YYYr   )r   alldiffr   lenr@   valuesranger!   astypeappend)indicesdatasetrI   
collate_fncollate_fn_argscolumns_to_np_typesreturn_dictactual_size	out_batchcol
cast_dtyper!   r%   s     `         @r   np_get_batchr`   U   s    
vbgg!#$$ !
WR[1_45 !
 
 
 
#kkmm
 
 
 d5<<>>**1-..KYYYYeKFXFXYYYEJu0000E $	288:: 	# 	#OCHU3Z((ELL,,E"IcNN		# 	288:: 	$ 	$OCHU3Z((ELL,,EU####r   c	           	      `   t           j        rddlnt          d          t	          t
          | |||d                                                  dj                  g          fd            }	j	        j
                            t          j        t          |           t          j                            }
|r"|
                    t          |                     }
|
                    ||	                              |	          }
fd
}|
                    |          S )a   Create a tf.data.Dataset from the underlying Dataset. This is a single-process method - the multiprocess
    equivalent is multiprocess_dataset_to_tf.

            Args:
                dataset (`Dataset`): Dataset to wrap with tf.data.Dataset.
                cols_to_retain (`List[str]`): Dataset column(s) to load in the
                    tf.data.Dataset. It is acceptable to include column names that are created by the `collate_fn` and
                    that do not exist in the original dataset.
                collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate
                    lists of samples into a batch.
                collate_fn_args (`Dict`): A  `dict` of keyword arguments to be passed to the
                    `collate_fn`. Can be empty.
                columns_to_np_types (`Dict[str, np.dtype]`): A `dict` mapping column names to numpy dtypes.
                output_signature (`Dict[str, tf.TensorSpec]`): A `dict` mapping column names to
                    `tf.TensorSpec` objects.
                shuffle(`bool`): Shuffle the dataset order when loading. Recommended True for training, False for
                    validation/evaluation.
                batch_size (`int`): Size of batches to load from the dataset.
                drop_remainder(`bool`, default `None`): Drop the last incomplete batch when loading. If not provided,
                    defaults to the same setting as shuffle.

            Returns:
                `tf.data.Dataset`
    r   Nr   F)rW   rI   rX   rY   rZ   r[   )input_signaturec                                          | gfd                                D                       fdt                                                    D             S )Nc                 D    g | ]}j                             |          S r   )dtypesas_dtype)r   dtyper#   s     r   r   z9dataset_to_tf.<locals>.fetch_function.<locals>.<listcomp>   s)    VVV")$$U++VVVr   )inpToutc                 (    i | ]\  }}||         S r   r   )r   rM   rG   outputs      r   rJ   z9dataset_to_tf.<locals>.fetch_function.<locals>.<dictcomp>   s#    SSS61cVAYSSSr   )numpy_functionrR   	enumeratekeys)rV   rk   rZ   	getter_fnr#   s    @r   fetch_functionz%dataset_to_tf.<locals>.fetch_function   s|    ""	VVVV9L9S9S9U9UVVV	 # 
 
 TSSSY7J7O7O7Q7Q-R-RSSSSr   )rg   )drop_remainderc                 F    fd|                                  D             S )Nc                 Z    i | ]'\  }}|                     ||         j                  (S r   )ensure_shapeshape)r   rG   valoutput_signaturer#   s      r   rJ   z8dataset_to_tf.<locals>.ensure_shapes.<locals>.<dictcomp>   s7    jjj83PSR__S*:3*?*EFFjjjr   rN   )
input_dictrw   r#   s    r   ensure_shapesz$dataset_to_tf.<locals>.ensure_shapes   s-    jjjjjWaWgWgWiWijjjjr   )r	   r   r   r   r   r`   function
TensorSpecint64dataDatasetfrom_tensor_slicesr   arangerQ   shuffler%   map)rW   rI   rX   rY   rZ   rw   r   
batch_sizerq   rp   
tf_datasetry   ro   r#   s       ``      @@r   dataset_to_tfr   z   sg   F  dbccc%'/  I [["--bh"?"?!@[AAT T T T T T BAT 33BIc'llRTRZ4[4[4[\\J 6''G55
!!*^!LLPPQ_``Jk k k k k k >>-(((r   c                   ,    e Zd Zd Zd Zd Zd Zd ZdS )SharedMemoryContextc                 "    g | _         g | _        d S r-   )created_shmsopened_shmsselfs    r   __init__zSharedMemoryContext.__init__   s    r   c                     t          t          |          ||          }|r| j                            |           n| j                            |           |S )N)sizenamecreate)r   intr   rU   r   )r   r   r   r   shms        r   get_shmzSharedMemoryContext.get_shm   s\    D		VDDD 	)$$S)))) ##C(((
r   c                     |                      |t          j        |          t          j        |          j        z  |          }t          j        |||j                  S )N)r   r   r   )rg   buffer)r   r   prodrg   itemsizer   buf)r   r   ru   rg   r   r   s         r   	get_arrayzSharedMemoryContext.get_array   sK    ll275>>BHUOO<T+T]clddz%uSW====r   c                     | S r-   r   r   s    r   	__enter__zSharedMemoryContext.__enter__       r   c                     | j         D ]*}|                                 |                                 +| j        D ]}|                                 d S r-   )r   closeunlinkr   )r   exc_type	exc_value	tracebackr   s        r   __exit__zSharedMemoryContext.__exit__   s[    $ 	 	CIIKKKJJLLLL# 	 	CIIKKKK	 	r   N)__name__
__module____qualname__r   r   r   r   r   r   r   r   r   r      s_            > > >      r   r   c                   L    e Zd Zd Zd Zd Zed             Zed             ZdS )NumpyMultiprocessingGeneratorc                 `    | _         | _        | _        | _        d |                                D              _         fd|                                D              _        | _        | _        | _	        |	 _
        |
 _         fd|                                D              _        d S )Nc                 L    g | ]!\  }}|t           j        t           j        fv |"S r   )r   unicode_str_)r   r^   rg   s      r   r   z:NumpyMultiprocessingGenerator.__init__.<locals>.<listcomp>   s4    tttzsETY^`^ikmkr]sTsTssTsTsTsr   c                 V    i | ]%\  }}||j         vr|nt          j        d           &S )U1)string_columnsr   rg   )r   r^   rg   r   s      r   rJ   z:NumpyMultiprocessingGenerator.__init__.<locals>.<dictcomp>   sH     $
 $
 $
U #T%888bhtnn$
 $
 $
r   c                     i | ]D\  }}||j         vrt          |j        j                  nt          |j        j                  d z   ES rC   )r   r   ru   rank)r   r^   specr   s      r   rJ   z:NumpyMultiprocessingGenerator.__init__.<locals>.<dictcomp>  sb     !
 !
 !
T D4G)G)GTZ_%%%SQUQ[Q`MaMadeMe!
 !
 !
r   )rW   rI   rX   rY   r   r   rZ   rw   r   r   rq   num_workerscolumns_to_ranks)r   rW   rI   rX   rY   rZ   rw   r   r   rq   r   s   `          r   r   z&NumpyMultiprocessingGenerator.__init__   s     ,$.tt5H5N5N5P5Pttt$
 $
 $
 $
17799$
 $
 $
  !1$,&!
 !
 !
 !
-3355!
 !
 !
r   c           
   #   D   K   t           j        t          t          t	           j                   j        z                                }                      j         j         j        | j	                  \  }}}t          d          g g }g }fdt          |          D             }fdt          |          D             } j         j         j         j         j         j         j        d}	t%                      5 t          |          D ]t'          t)                                }
d d|
                                fd j                                        D             }|                    |           |         }|k    r||}nd }|||         |         d|	}                     j        |d	
          }|                                 |                    |           d}|sht          |          D ]T|                             d          st7          d          |                                          |         }t;          d |                                D                       rd	} nt%                      5  fd|                                D             }d |                                D             } j        D ]G}||                             d||         j         d                    !                    d          ||<   H	 d d d            n# 1 swxY w Y   |V  |         "                                 V|h|D ]}|#                                 	 d d d            d S # 1 swxY w Y   d S )Nspawnc                 8    g | ]}                                 S r   Eventr   _ctxs     r   r   z:NumpyMultiprocessingGenerator.__iter__.<locals>.<listcomp>  s!    FFFaciikkFFFr   c                 8    g | ]}                                 S r   r   r   s     r   r   z:NumpyMultiprocessingGenerator.__iter__.<locals>.<listcomp>  s!    GGGqsyy{{GGGr   )rW   rI   rX   rY   rZ   r   r   datasets_tf_worker_r   c           	      l    i | ]0\  }}|                      d | d|ft          j        d          1S )r   _shapeTru   rg   r   r   r   r|   r   r^   r   shm_ctxworker_names      r   rJ   z:NumpyMultiprocessingGenerator.__iter__.<locals>.<dictcomp>#  s_     ' ' '!T **k+G+GC+G+G+GPTw^`^fos*tt' ' 'r   )r   rV   extra_batcharray_ready_eventarray_loaded_eventT)targetkwargsdaemonF<   )timeoutzData loading worker timed out!c              3   F   K   | ]}t          j        |d k               V  dS )r   N)r   any)r   ru   s     r   	<genexpr>z9NumpyMultiprocessingGenerator.__iter__.<locals>.<genexpr>A  s0      PP26%!),,PPPPPPr   c           	      v    i | ]5\  }}|                               d | |j        |         d          6S )r   Fr   )r   rZ   )r   r^   ru   batch_shm_ctxrM   namesr   s      r   rJ   z:NumpyMultiprocessingGenerator.__iter__.<locals>.<dictcomp>O  sl     " " " !+U  !8!8#(8 3 3c 3 3&+&*&>s&C',	 "9 " "" " "r   c                 >    i | ]\  }}|t          j        |          S r   )r   copy)r   r^   arrs      r   rJ   z:NumpyMultiprocessingGenerator.__iter__.<locals>.<dictcomp>Z  s&    !S!S!SS#rws||!S!S!Sr   UrD   )$minr   r   r   rQ   rW   r   distribute_batchesrq   r   r   rS   rI   rX   rY   rZ   r   r   r   strr   rU   r   Processworker_loopstartwaitTimeoutErrorclearr   rR   viewru   squeezesetjoin)r   r   per_worker_batchesfinal_batchfinal_batch_workershape_arraysworkersarray_ready_eventsarray_loaded_events	base_argsworker_random_idworker_shape_arraysworker_indicesfinal_batch_argworker_kwargsworkerend_signal_receivedarray_shapesarrays
string_colr   r   rM   r   r   r   s   `                   @@@@@@r   __iter__z&NumpyMultiprocessingGenerator.__iter__  s     $*CS5F5F5X0Y0Y,Z,Z[[>B>U>UL$/4+>T\?
 ?
;K); '""FFFF53E3EFFFGGGGE+4F4FGGG |"1/#3#'#; $ 5"1
 
	 !"" H	g;'' ' '#&uww<< JAJJ8HJJ[)))' ' ' ' '%)%:%@%@%B%B' ' '# ##$7888!3A!6***{/F&1OO&*O#.-#2);A)>*=a*@! !  ! D,<][_``v&&&&"') &1{++ %1 %1A-a055b5AA M*+KLLL&q)//111#/?LPP,:M:M:O:OPPPPP  /3+ -.. -" " " " " " " /;.@.@.B.B" " " "T!SFLLNN!S!S!S*.*=  J &z 2 7 78ZF:<N<TUW<X8Z8Z [ [ c cdf g g #:..              & !LLL'*..0000M * &1R "  OH	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	 H	s8   
FN"BM 4N MNM?NNNc                     | S r-   r   r   s    r   __call__z&NumpyMultiprocessingGenerator.__call__g  r   r   c           
          	
 dt           j        d<   t          j        rdd l}nt          d          |j                            g d           
 	f
d}t                      5 	fd|                                D             |D ]} ||           | ||                                           D ]\  }}d|d d <   
	                                 d d d            d S # 1 swxY w Y   d S )	N3TF_CPP_MIN_LOG_LEVELr   r   GPUc           	      4  
 t          | 	
d          }i }t                      5 }                                D ]\  }}||         }|v r0|                    d                              |j        dz             }|j        |         d d <   |                     d| |j        |d          ||<   |||         d d <                                                                     	                                 d d d            d S # 1 swxY w Y   d S )NT)rV   rW   rI   rX   rY   rZ   r[   r   )rD   r   r   )
r`   r   r   r   reshaperu   r   r   r   r   )rV   r%   
out_arraysr   r^   r_   r!   r   r   rX   rY   rI   rZ   rW   r   r   r   s          r   send_batch_to_parentzGNumpyMultiprocessingGenerator.worker_loop.<locals>.send_batch_to_parent  s    -% /$7   E J$&& +- (;'@'@'B'B / /OC!#JEn,, !&

4 0 0 8 8u9L M M+0;L%aaa(&3&=&=&....ek\` '> ' 'JsO */JsOAAA&&!%%'''"'')))"((***%+ + + + + + + + + + + + + + + + + +s   CDDDc           	      l    i | ]0\  }}|                      d | d|ft          j        d          1S )r   r   Fr   r   r   s      r   rJ   z=NumpyMultiprocessingGenerator.worker_loop.<locals>.<dictcomp>  s^       C W&&+'C'C'C'C'CD7Z\Zbkp&qq  r   rD   )
osenvironr	   r   r   r   set_visible_devicesr   r   r   )rW   rI   rX   rY   rZ   r   r   rV   r   r   r   r   r#   r  r%   r^   r!   r   r   s   ````` `  ```     @@r   r   z)NumpyMultiprocessingGenerator.worker_loopj  s    .1
)* 	h#####fggg
	%%b%000	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+B !"" 	$g    !1!7!7!9!9  L
 ! , ,$$U++++&$$[111*0022  
Uaaa!!###	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$s   2A4C33C7:C7c                    t          j        t          |                     }|rt           j                            |           t          |          }|||z  z
  }t          j        ||g          \  }}|st          |          dk    rd }|                    d|          }t          |          }	|	|	|z  z
  }
t          j        ||
g          \  }}|                    d||          }t          j        ||j        d         d          }d |D             }t          t          |                    D ]=}t          j	        ||         ||                             dd          gd          ||<   >|t          |          }nd }|||fS )Nr   rD   rC   )axisc                 8    g | ]}t          j        |d           S r   )r   r   )r   r   s     r   r   zDNumpyMultiprocessingGenerator.distribute_batches.<locals>.<listcomp>  s$    eeebj;;eeer   )
r   r   rQ   randomr   splitr  ru   rS   concatenate)rW   r   rq   r   r   rV   num_samplesincomplete_batch_cutofflast_incomplete_batchnum_batchesfinal_batches_cutofffinal_batchesper_worker_indicesrM   incomplete_batch_worker_idxs                  r   r   z0NumpyMultiprocessingGenerator.distribute_batches  s   )CLL)) 	'Ig&&&'ll #.z1I"J)+'<S;T)U)U&& 	)S!6771<<$(!//"j11'll*kK.GH!#'4H3I!J!J//"k:>>Xgw}Q/?aHHHeeRdeees=))** 	u 	uA$&N4Fq4I=YZK[KcKcdegiKjKj3krs$t$t$tq!! ,*-m*<*<''*.'!#8:UUUr   N)	r   r   r   r   r   r   staticmethodr   r   r   r   r   r   r      s         
  
  
D_ _ _B   E$ E$ \E$N V V \V V Vr   r   c
                    t           j        rddl}
nt          d          t	          | |||||||||	
  
        }|
j        j                            ||          }|r t          t          |           |z            }n,t          t          t          |           |z                      }|                    |
j        j                            |                    S )a[  Create a tf.data.Dataset from the underlying Dataset. This is a multi-process method - the single-process
    equivalent is dataset_to_tf.

            Args:
                dataset (`Dataset`): Dataset to wrap with tf.data.Dataset.
                cols_to_retain (`List[str]`): Dataset column(s) to load in the
                    tf.data.Dataset. It is acceptable to include column names that are created by the `collate_fn` and
                    that do not exist in the original dataset.
                collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate
                    lists of samples into a batch.
                collate_fn_args (`Dict`): A  `dict` of keyword arguments to be passed to the
                    `collate_fn`. Can be empty.
                columns_to_np_types (`Dict[str, np.dtype]`): A `dict` mapping column names to numpy dtypes.
                output_signature (`Dict[str, tf.TensorSpec]`): A `dict` mapping column names to
                    `tf.TensorSpec` objects.
                shuffle(`bool`): Shuffle the dataset order when loading. Recommended True for training, False for
                    validation/evaluation.
                batch_size (`int`): Size of batches to load from the dataset.
                drop_remainder(`bool`, default `None`): Drop the last incomplete batch when loading. If not provided,
                    defaults to the same setting as shuffle.
                num_workers (`int`): Number of workers to use for loading the dataset. Should be >= 1.

            Returns:
                `tf.data.Dataset`
    r   Nr   )
rW   rI   rX   rY   rZ   rw   r   r   rq   r   )rw   )r	   r   r   r   r   r}   r~   from_generatorr   rQ   r   applyexperimentalassert_cardinality)rW   rI   rX   rY   rZ   rw   r   r   rq   r   r#   data_generatorr   dataset_lengths                 r   multiprocess_dataset_to_tfr    s    J  dbccc2%'/)%  N //Qa/bbJ >S\\Z788T#g,,";<<==BG0CCNSSTTTr   )F)__doc__r  	functoolsr   mathr   uuidr   numpyr   pyarrowr.   multiprocessr   multiprocess.shared_memoryr   r   r<   r	   r'   r+   r1   r>   r`   r   r   r   r  r   r   r   <module>r'     s      				                           $ $ $ $ $ $7777777   LLL        $  i i i  & ej" " " "JF) F) F)R       @mV mV mV mV mV mV mV mV`<U <U <U <U <Us   / 99