from typing import Dict, Iterator, List, Optional, Tuple, Union

from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC

from .base_tokenizer import BaseTokenizer


class SentencePieceBPETokenizer(BaseTokenizer):
    """SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    """
    N<unk>   ▁TFvocabmerges	unk_tokenreplacementadd_prefix_spacedropoutfuse_unkc           	         |$|"t          t          |||||                    }nt          t          |||                    }|                    t          |                    #|                    t          |          g           t                      |_        t          j        ||          |_	        t          j        ||          |_        d||||d}	t                                          ||	           d S )N)r   r   r   )r   r   SentencePieceBPE)modelr   r   r   r   )r
   r   token_to_idstradd_special_tokensr   
normalizerr   	Metaspacepre_tokenizerr   decodersuper__init__)selfr   r   r   r   r   r   r   	tokenizer
parameters	__class__s             Llib/python3.11/site-packages/tokenizers/implementations/sentencepiece_bpe.pyr(   z"SentencePieceBPETokenizer.__init__   s    !3!#eVWPYdl"m"m"mnnII!#gU]"^"^"^__I  Y00<((#i..)9:::#vv	"0":{eu"v"v"v	$.;Yijjj	 ("& 0
 

 	J/////    vocab_filenamemerges_filenamec                 N    t          j        | |          \  }}t          ||fi |S )N)r   	read_filer   )r/   r0   kwargsr   r   s        r-   	from_filez#SentencePieceBPETokenizer.from_file0   s/    noFFv(AA&AAAr.   i0u     i  files
    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )

        # Accept a single path as well as a list of paths.
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)
    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        # `length` is the total number of sequences, used only for progress reporting.
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )