
    :'aE=                        d dl mZmZ d dlmZmZmZmZmZ 	 d dl	m
Z
 n# e$ r eedf         Z
Y nw xY wd dlmZmZmZ d dlmZ d dlmZmZ d dlmZ d d	lZd d
lmZmZmZmZmZmZ d dl m!Z!m"Z"m#Z#m$Z$  ej%        d          Z&e&'                    ej(                    ej)                    Z*e*+                     ej,        d                     e&-                    e*           	 	 	 	 	 	 	 d"de.de/de/de0dee         dee         de1de1defdZ2	 	 	 	 	 	 	 d"dede/de/de0dee         dee         de1de1defdZ3	 	 	 	 	 	 	 d"de
de/de/de0dee         dee         de1de1defd Z4d#de
de/de/de0dee         dee         de1defd!Z5d	S )$    )splitextbasename)ListBinaryIOOptionalSetUnion)PathLikezos.PathLike[str])TOO_SMALL_SEQUENCETOO_BIG_SEQUENCEIANA_SUPPORTED)
mess_ratio)CharsetMatchesCharsetMatch)warnN)any_specified_encodingis_multi_byte_encodingidentify_sig_or_bomshould_strip_sig_or_bomis_cp_similar	iana_name)coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratioscharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s      皙?TF	sequencessteps
chunk_size	thresholdcp_isolationcp_exclusionpreemptive_behaviourexplainreturnc                 x   |s%t                               t          j                   n$t                               t          j                   t          |           }|dk    r<t                               d           t          t          | dddg d          g          S |;t                               dd		                    |                     d
 |D             }ng }|;t                               dd		                    |                     d |D             }ng }|||z  k    r!t                               d|||           d}|}|dk    r||z  |k     rt          ||z            }t          |           t          k     }	t          |           t          k    }
|	r"t          d                    |                     g }|du rt          |           nd}|0|                    |           t                               d|           t%                      }g }g }d}d}d}d}d}t                      }t'          |           \  }}|>|                    |           t                               dt          |          |           |                    d           d|vr|                    d           |t(          z   D ]}|r||vr
|r||v r||v r|                    |           d}||k    }|ot-          |          }|dv r |du rt                               d|           h	 t/          |          }n2# t0          t2          f$ r t                               d|           Y w xY w	 |
rS|du rOt7          |du r| dt          d                   n#| t          |          t          d                   |           n,t7          |du r| n| t          |          d         |          }n# t8          $ rP}t                               d|t7          |                     |                    |           |s|dz  }Y d}~d}~wt:          $ r  |                    |           |s|dz  }Y w xY wd}|D ]}t=          ||          rd} n|rt                               d||           t?          |du rdnt          |          |t          ||z                      } |o|duot          |          |k     }!|!rt                               d|           t          t          |           dz            }"|"dk     rd}"d}#g }$g }%| D ]}&| |&|&|z            }'|r	|du r||'z   }'|'                     |d          }(|$                    |(           |%                    tC          |(|                     |%d          |k    r|#dz  }#|#|"k    s|r|du r n|%r tE          |%          t          |%          z  })nd})|)|k    s|#|"k    r||                    |           |s|dz  }t                               d!||#tG          |)d"z  d#$                     |dd|fv r(t          | ||dg |          }*||k    r|*}n|dk    r|*}n|*}t                               d%|tG          |)d"z  d#$                     |stI          |          }+ntK          |          }+|+r;t                               d&                    |t7          |+                               g },|$D ]?}(tM          |(d'|+rd(	                    |+          nd          }-|,                    |-           @tO          |,          }.|.r.t                               d)                    |.|                     |                    t          | ||)||.|                     ||ddfv r9|)d'k     r3t                               d*|           t          ||         g          c S ||k    r3t                               d+|           t          ||         g          c S |d          j(        r't                               d,|||         j)                   t          |          dk    r|s|s|rt                               d-           |r6t                               d.|j*                   |                    |           nw|r||r@|j+        |j+        k    r0t                               d/           |                    |           n1|r/t                               d0           |                    |           |S )1aD  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    r   zXGiven content is empty, stopping the process very early, returning empty utf_8 str matchutf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, c                 .    g | ]}t          |d           S Fr   .0cps     Z/mounts/lovelace/software/anaconda3/lib/python3.11/site-packages/charset_normalizer/api.py
<listcomp>zfrom_bytes.<locals>.<listcomp>N   "    DDD	"e,,DDD    zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.c                 .    g | ]}t          |d           S r-   r.   r/   s     r2   r3   zfrom_bytes.<locals>.<listcomp>W   r4   r5   z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.   z>Trying to detect encoding from a tiny portion of ({}) byte(s).Tz@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_16utf_32z[Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.z2Encoding %s does not provide an IncrementalDecoderg    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %szW%s is deemed too similar to code page %s and was consider unsuited already. Continuing!zpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.      ignore)errorszc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d      )ndigitsz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {}g?,z We detected language {} using {}z0%s is most likely the one. Stopping the process.z[%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.z:Using %s code page we detected the following languages: %szONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z#%s will be used as a fallback matchz&utf_8 will be used as a fallback matchz&ascii will be used as a fallback match),loggersetLevelloggingCRITICALINFOlenwarningr   r   joinintr   r   r   formatr   appendinfosetr   r   addr   r   ModuleNotFoundErrorImportErrordebugstrUnicodeDecodeErrorLookupErrorr   rangedecoder   sumroundr   r   r   r   	languages
_languagesr;   fingerprint)/r    r!   r"   r#   r$   r%   r&   r'   lengthis_too_small_sequenceis_too_large_sequenceprioritized_encodingsspecified_encodingtestedtested_but_hard_failuretested_but_soft_failurefallback_asciifallback_u8fallback_specifiedsingle_byte_hard_failure_countsingle_byte_soft_failure_countresultssig_encodingsig_payloadencoding_ianadecoded_payloadbom_or_sig_availablestrip_sig_or_bomis_multi_byte_decoderesimilar_soft_failure_testencoding_soft_failedr_multi_byte_bonusmax_chunk_gave_upearly_stop_count	md_chunks	md_ratiosicut_sequencechunkmean_mess_ratiofallback_entrytarget_languages	cd_ratioschunk_languagescd_ratios_mergeds/                                                  r2   
from_bytesr      s   2  &())))%%%^^F{{qrrr 	
 
 	
  @yy..	0 	0 	0 ED|DDD6IIl##	% 	% 	% ED|DDD*u$%%l:v	' 	' 	' 
qyyVe^j00%((
	NN-??	NN.>> ^MTTU[\\]]]>RVZ>Z>Z/	:::`d%$$%7888VXjkkkUUF  NK%&"%&"G 3I > >L+$$\222_adepaqaqs  	A  	A  	A  )))+++$$W---.~= D D 	M== 	M\99F""

=!!!+}</Z4KM4Z4Z0005IU5R5RKKu  xE  F  F  F	$:=$I$I!!#[1 	 	 	LLM}]]]H		$ 	)>%)G)G-=-F-FIjs4yyj))IVYZeVfVfgjkogpgpVpLq*    
 #&!1U!:!:II	#kJZJZJ[J[@\*# # # " 	 	 	NNVXegjklgmgmnnn#**=999( 4.!3.HHHH 	 	 	#**=999( 4.!3.H		 %*!$; 	 	 ],@AA ,0) % 	NNt  wD  FZ  [  [  [%..AAC4D4D
 
 1r_D5PrUXYhUiUilrUr 	[KK  K  MZ  [  [  [B!,,q   !		 	 	A$Qq:~%56L# 8(8E(A(A*<7 ''h'GGEU###    }	)) A%  $555;O5TdhmTmTm 	!!)nns9~~=OO Oi''+;?P+P+P#**=999( 4.!3.NN ;(+ 3!6BBB	D D D '3E FFF!-!#" " !$666)7&&"g--%3NN"0KK/C'333	
 	
 	
 % 	D1-@@4]CC 	oKK@GGWZ[kWlWlmmnnn	 	 	E-eSXh:r#((CS:T:T:TnrssO    2)<< 	dKK:AABRTabbccc$  		
 		
 		
 /'BBBY\G\G\KKJMZZZ!'(     L((KKm   "'(     2;  	KKL&1   7||q 	n. 	n,> 	nNNlmmm 	+NN@BTB]^^^NN-.... 	+n4+4+JaeseJJNNCDDDNN;'''' 	+NNCDDDNN>***Ns2    L00,MM#BO%%
Q'/AP::)Q'&Q'fpc           
      R    t          |                                 |||||||          S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )r   read)r   r!   r"   r#   r$   r%   r&   r'   s           r2   from_fpr   b  s6     
			 	 	r5   pathc                     t          | d          5 }t          ||||||||          cddd           S # 1 swxY w Y   dS )z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openr   )	r   r!   r"   r#   r$   r%   r&   r'   r   s	            r2   	from_pathr   |  s     
dD		 tRr5*i|Uikrsst t t t t t t t t t t t t t t t t ts   488c           
      Z   t          | ||||||          }t          |           }t          t          |                    }	t	          |          dk    r"t          d                    |                    |                                }
|	dxx         d|
j        z   z  cc<   t          d                    | 
                    |d                    |	                              d          5 }|                    |
                                           ddd           n# 1 swxY w Y   |
S )zi
    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
    r   z;Unable to normalize "{}", no encoding charset seems to fit.-z{}r+   wbN)r   r   listr   rJ   IOErrorrN   bestr;   r   replacerL   writeoutput)r   r!   r"   r#   r$   r%   r&   rm   filenametarget_extensionsresultr   s               r2   	normalizer     s\     G ~~HXh//00
7||qSZZ[cddeee\\^^FaC&/11	dkk$,,x9J1K1KLLMMt	T	T 
XZ
MMOO	
 	
 	

 
 
 
 
 
 
 
 
 
 
 
 
 
 

 Ms   ,(D  D$'D$)r   r   r   NNTF)r   r   r   NNT)6os.pathr   r   typingr   r   r   r   r	   osr
   rT   rV   charset_normalizer.constantr   r   r   charset_normalizer.mdr   charset_normalizer.modelsr   r   warningsr   rG   charset_normalizer.utilsr   r   r   r   r   r   charset_normalizer.cdr   r   r   r   	getLoggerrE   rF   DEBUGStreamHandlerhandlersetFormatter	Formatter
addHandlerbytesrM   floatboolr   r   r   r    r5   r2   <module>r      s   & & & & & & & & 7 7 7 7 7 7 7 7 7 7 7 7 7 7. . . .S,,-HHH. ] \ \ \ \ \ \ \ \ \ , , , , , , B B B B B B B B       6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 t t t t t t t t t t t t		/	0	0    
'

!
!   &W&'RSS T T T   '   
 "&"&%)D DDD D 	D
 3iD 3iD #D D D D D DR
 "&"&%)   	
 3i 3i #     8 "&"&%)t ttt t 	t
 3it 3it #t t t t t t$ H S # PU lpqtlu   MQ  RU  MV   uy   EQ      s    11