
    :'a?              
          d dl mZ d dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZ  G d d          Z G d de          Z G d	 d
e          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Zdee         dee         de fdZ! ed          d$ded e"d!e de"fd"            Z#d#S )%    )	lru_cache)OptionalList)UNICODE_SECONDARY_RANGE_KEYWORD)is_punctuation	is_symbolunicode_rangeis_accentuatedis_latinremove_accentis_separatoris_cjkis_case_variable	is_hangulis_katakanais_hiraganais_asciiis_thaic                   V    e Zd ZdZdedefdZdeddfdZd	dZe	de
fd            ZdS )
MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                     t           )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr   s     Y/mounts/lovelace/software/anaconda3/lib/python3.11/site-packages/charset_normalizer/md.pyeligiblezMessDetectorPlugin.eligible   
     "!    Nc                     t           )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r   r   s     r   feedzMessDetectorPlugin.feed   s
    
 "!r!   c                     t           )zB
        Permit to reset the plugin to the initial state.
        r   r   s    r   resetzMessDetectorPlugin.reset   r    r!   c                     t           )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r   r%   s    r   ratiozMessDetectorPlugin.ratio"   s
     "!r!   r   N)__name__
__module____qualname____doc__strboolr   r#   r&   propertyfloatr(    r!   r   r   r   	   s         
"# "$ " " " ""c "d " " " "" " " " "u " " " X" " "r!   r   c                   X    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd            ZdS )
 TooManySymbolOrPunctuationPluginc                 L    d| _         d| _        d| _        d | _        d| _        d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr%   s    r   __init__z)TooManySymbolOrPunctuationPlugin.__init__-   s0    "# !$(!&+###r!   r   r   c                 *    |                                 S Nisprintabler   s     r   r   z)TooManySymbolOrPunctuationPlugin.eligible5       $$&&&r!   Nc                     | xj         dz  c_         || j        k    rY|dvrUt          |          r| xj        dz  c_        n5|                                du rt          |          r| xj        dz  c_        || _        d S )N   )<>=:/&;{}[],|"F   )r8   r9   r   r6   isdigitr   r7   r   s     r   r#   z%TooManySymbolOrPunctuationPlugin.feed8   s    "111i  HN  7N  7Ni(( (''1,'''""$$--)I2F2F-""a'""$-!!!r!   c                 0    d| _         d| _        d| _        d S Nr   )r6   r8   r7   r%   s    r   r&   z&TooManySymbolOrPunctuationPlugin.resetC   s     "# !r!   c                 ^    | j         dk    rdS | j        | j        z   | j         z  }|dk    r|ndS )Nr           333333?)r8   r6   r7   )r   ratio_of_punctuations     r   r(   z&TooManySymbolOrPunctuationPlugin.ratioH   sE     A%%2 $ 7$:L LPTPee';s'B'B##Jr!   r)   r*   r+   r,   r;   r.   r/   r   r#   r&   r0   r1   r(   r2   r!   r   r4   r4   +   s        , , ,'# '$ ' ' ' '	.c 	.d 	. 	. 	. 	.   
 Ku K K K XK K Kr!   r4   c                   X    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd            ZdS )
TooManyAccentuatedPluginc                 "    d| _         d| _        d S rT   r8   _accentuated_countr%   s    r   r;   z!TooManyAccentuatedPlugin.__init__T        !"#r!   r   r   c                 *    |                                 S r=   )isalphar   s     r   r   z!TooManyAccentuatedPlugin.eligibleX   s      """r!   Nc                 h    | xj         dz  c_         t          |          r| xj        dz  c_        d S d S NrB   )r8   r
   r^   r   s     r   r#   zTooManyAccentuatedPlugin.feed[   sJ    ")$$ 	)##q(####	) 	)r!   c                 "    d| _         d| _        d S rT   r]   r%   s    r   r&   zTooManyAccentuatedPlugin.reseta   r_   r!   c                 N    | j         dk    rdS | j        | j         z  }|dk    r|ndS )Nr   rV   gffffff?r]   )r   ratio_of_accentuations     r   r(   zTooManyAccentuatedPlugin.ratioe   s<     A%%2 $ 7$:O O(=(E(E$$2Mr!   r)   rY   r2   r!   r   r[   r[   R   s        $ $ $## #$ # # # #)c )d ) ) ) )$ $ $ $ Nu N N N XN N Nr!   r[   c                   X    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd            ZdS )
UnprintablePluginc                 "    d| _         d| _        d S rT   )_unprintable_countr8   r%   s    r   r;   zUnprintablePlugin.__init__o   s    "# !r!   r   r   c                     dS NTr2   r   s     r   r   zUnprintablePlugin.eligibles       tr!   Nc                 z    |dvr&|                                 du r| xj        dz  c_        | xj        dz  c_        d S )N>   	
FrB   )r?   rj   r8   r   s     r   r#   zUnprintablePlugin.feedv   sS    4449N9N9P9PTY9Y9Y##q(##"r!   c                     d| _         d S rT   )rj   r%   s    r   r&   zUnprintablePlugin.reset{   s    "#r!   c                 @    | j         dk    rdS | j        dz  | j         z  S )Nr   rV      )r8   rj   r%   s    r   r(   zUnprintablePlugin.ratio~   s+     A%%2'!+t/DDDr!   r)   rY   r2   r!   r   rh   rh   m   s        " " "# $    #c #d # # # #
$ $ $ $ Eu E E E XE E Er!   rh   c                   X    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd            ZdS )
SuspiciousDuplicateAccentPluginc                 0    d| _         d| _        d | _        d S rT   _successive_countr8   _last_latin_characterr%   s    r   r;   z(SuspiciousDuplicateAccentPlugin.__init__   s     !" !%)"""r!   r   r   c                 H    |                                 ot          |          S r=   )ra   r   r   s     r   r   z(SuspiciousDuplicateAccentPlugin.eligible   s!      "":x	':'::r!   Nc                 l   | xj         dz  c_         | j        t          |          rt          | j                  rr|                                r)| j                                        r| xj        dz  c_        t          |          t          | j                  k    r| xj        dz  c_        || _        d S rc   )r8   r{   r
   isupperrz   r   r   s     r   r#   z$SuspiciousDuplicateAccentPlugin.feed   s    "%1i(( 0^D<V-W-W 0$$&& 04+E+M+M+O+O 0**a/** ++}T=W/X/XXX**a/**%."""r!   c                 0    d| _         d| _        d | _        d S rT   ry   r%   s    r   r&   z%SuspiciousDuplicateAccentPlugin.reset   s     !" !%)"""r!   c                 @    | j         dk    rdS | j        dz  | j         z  S )Nr   rV   rQ   )r8   rz   r%   s    r   r(   z%SuspiciousDuplicateAccentPlugin.ratio   s+     A%%2&*d.CCCr!   r)   rY   r2   r!   r   rw   rw      s        * * *;# ;$ ; ; ; ;	/c 	/d 	/ 	/ 	/ 	/* * * *
 Du D D D XD D Dr!   rw   c                   X    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd            ZdS )
SuspiciousRangec                 0    d| _         d| _        d | _        d S rT   )"_suspicious_successive_range_countr8   _last_printable_seenr%   s    r   r;   zSuspiciousRange.__init__   s     23/ !$(!!!r!   r   r   c                 *    |                                 S r=   r>   r   s     r   r   zSuspiciousRange.eligible   r@   r!   Nc                 2   | xj         dz  c_         |                                st          |          r	d | _        d S | j        	|| _        d S t	          | j                  }t	          |          }t          ||          r| xj        dz  c_        || _        d S rc   )r8   isspacer   r   r	    is_suspiciously_successive_ranger   )r   r   unicode_range_aunicode_range_bs       r   r#   zSuspiciousRange.feed   s    " 	.";"; 	(,D%F$,(1D%F'(ABB'	22+O_MM 	933q833$-!!!r!   c                 0    d| _         d| _        d | _        d S rT   )r8   r   r   r%   s    r   r&   zSuspiciousRange.reset   s      !23/$(!!!r!   c                 T    | j         dk    rdS | j        dz  | j         z  }|dk     rdS |S )Nr   rV   rQ   g?)r8   r   )r   ratio_of_suspicious_range_usages     r   r(   zSuspiciousRange.ratio   sC     A%%2+/+RUV+VZ^Zo*o'*S002..r!   r)   rY   r2   r!   r   r   r      s        ) ) )
'# '$ ' ' ' '.c .d . . . .&) ) ) )
 	/u 	/ 	/ 	/ X	/ 	/ 	/r!   r   c                   X    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd            ZdS )
SuperWeirdWordPluginc                 v    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d S )Nr   F )_word_count_bad_word_count_is_current_word_bad_foreign_long_watchr8   _bad_character_count_buffer_buffer_accent_countr%   s    r   r;   zSuperWeirdWordPlugin.__init__   sG     $)!#(  !$%!$%!!!r!   r   r   c                     dS rl   r2   r   s     r   r   zSuperWeirdWordPlugin.eligible   rm   r!   Nc                 "   |                                 rd                    | j        |g          | _        t          |          r| xj        dz  c_        | j        du rmt          |          du r\t          |          du rKt          |          du r:t          |          du r)t          |          du rt          |          du rd| _        d S | j        sd S |                                st          |          st          |          r| j        r| xj        dz  c_        t!          | j                  }| xj        |z  c_        |dk    r| j        |z  dk    rd| _        |dk    r| j        rd| _        | j        r9| xj        dz  c_        | xj        t!          | j                  z  c_        d| _        d| _        d| _        d| _        d S |d	vr>|                                du r*t-          |          rd| _        | xj        |z  c_        d S d S d S d S )
Nr   rB   FT   rW      r   >   -rC   rE   rD   )ra   joinr   r
   r   r   r   r   r   r   r   r   r   r   r   r   lenr8   r   r   r   rR   r   )r   r   buffer_lengths      r   r#   zSuperWeirdWordPlugin.feed   s    	77DL)#<==DLi(( /))Q.))'500Xi5H5HE5Q5QV\]fVgVgkpVpVpu~  @I  vJ  vJ  NS  vS  vS  Xc  dm  Xn  Xn  rw  Xw  Xw  |G  HQ  |R  |R  V[  |[  |[  `g  hq  `r  `r  v{  `{  `{+/(F| 	F 	&>)#<#< 	&Y@W@W 	&]a]i 	&!--M!!]2!!!!d&?-&OSV&V&V,0)""t'?",0)( 2$$)$$))S->->>)),1)',D$DL()D%%%222y7H7H7J7Je7S7SXabkXlXl7S(,D%LLI%LLLL 327S7S7S7Sr!   c                 h    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d S )Nr   Fr   )r   r   r   r   r   r8   r   r%   s    r   r&   zSuperWeirdWordPlugin.reset
  s?    $)!#(   !$%!!!r!   c                 :    | j         dk    rdS | j        | j        z  S )N
   rV   )r   r   r8   r%   s    r   r(   zSuperWeirdWordPlugin.ratio  s&    r!!2(4+@@@r!   r)   rY   r2   r!   r   r   r      s        
& 
& 
&# $    &c &d & & & &B& & & & Au A A A XA A Ar!   r   c                   \    e Zd ZdZd ZdedefdZdeddfdZd
dZ	e
defd	            ZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and can be easily detected.
    Searching for the overuse of '丅' and '丄'.
    c                 "    d| _         d| _        d S rT   _wrong_stop_count_cjk_character_countr%   s    r   r;   zCjkInvalidStopPlugin.__init__!      !"$%!!!r!   r   r   c                     dS rl   r2   r   s     r   r   zCjkInvalidStopPlugin.eligible%  rm   r!   Nc                 t    |dv r| xj         dz  c_         d S t          |          r| xj        dz  c_        d S d S )N)u   丅u   丄rB   )r   r   r   r   s     r   r#   zCjkInvalidStopPlugin.feed(  sZ    &&""a'""F) 	+%%*%%%%	+ 	+r!   c                 "    d| _         d| _        d S rT   r   r%   s    r   r&   zCjkInvalidStopPlugin.reset/  r   r!   c                 :    | j         dk     rdS | j        | j         z  S )N   rV   )r   r   r%   s    r   r(   zCjkInvalidStopPlugin.ratio3  s&    $r))2%(AAAr!   r)   )r*   r+   r,   r-   r;   r.   r/   r   r#   r&   r0   r1   r(   r2   r!   r   r   r     s         
& & &# $    +c +d + + + +& & & & Bu B B B XB B Br!   r   c                   X    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd            ZdS )
ArchaicUpperLowerPluginc                 h    d| _         d| _        d| _        d| _        d| _        d | _        d| _        d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr8   _last_alpha_seen_current_ascii_onlyr%   s    r   r;   z ArchaicUpperLowerPlugin.__init__<  s?    	/0,-.*340 ! $#'   r!   r   r   c                     dS rl   r2   r   s     r   r   z ArchaicUpperLowerPlugin.eligibleI  rm   r!   Nc                    |                                 ot          |          }|du }|r| j        dk    rt| j        dk    r4|                                du r| j        du r| xj        | j        z  c_        d| _        d| _        d | _        d| _        | xj	        dz  c_	        d| _        d S | j        du rt          |          du rd| _        | j        |                                r| j                                        s-|                                rB| j                                        r)| j        du r| xj        dz  c_        d| _        nd| _        nd| _        | xj	        dz  c_	        | xj        dz  c_        || _        d S )NFr   @   rB   TrQ   )ra   r   r   rR   r   r   r   r   r   r8   r   r~   islower)r   r   is_concerned	chunk_seps       r   r#   zArchaicUpperLowerPlugin.feedL  s    ((**J/?	/J/J E)	 	=AA3r99i>O>O>Q>QUZ>Z>Z_c_w  |A  `A  `A88D<^^8812D.34D0$(D!DI!!Q&!!'+D$F#t++0C0Cu0L0L',D$ ,!!## "(=(E(E(G(G "YM^M^M`M` "eiez  fC  fC  fE  fE "9$$66!;66 %DII $DII!	",,1,, )r!   c                 h    d| _         d| _        d| _        d| _        d | _        d| _        d| _        d S )Nr   FT)r8   r   r   r   r   r   r   r%   s    r   r&   zArchaicUpperLowerPlugin.resetn  s?     !/0,-.*340 $	#'   r!   c                 :    | j         dk    rdS | j        | j         z  S )Nr   rV   )r8   r   r%   s    r   r(   zArchaicUpperLowerPlugin.ratiow  s&     A%%27$:OOOr!   r)   rY   r2   r!   r   r   r   :  s        ( ( (# $     *c  *d  *  *  *  *D( ( ( ( Pu P P P XP P Pr!   r   r   r   r   c                    | |dS | |k    rdS d| v rd|v rdS d| v sd|v rdS |                      d          |                     d          }}|D ]}|t          v r||v r dS | dv r|dv rdS | dv s|dv r
d| v sd|v rdS d	| v sd	|v rd| v sd|v rdS | d
k    s|d
k    rdS d| v sd|v s| dv r|dv rd| v sd|v rdS d| v sd|v rdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFLatin	Emoticons )KatakanaHiraganaCJKHangulzBasic LatinPunctuationForms)splitr   )r   r   keywords_range_akeywords_range_bels        r   r   r     s    /"9t/))u/!!g&@&@uo%%)G)Gu)8)>)>s)C)C_EZEZ[^E_E_&  000!!!55 " 222Jb7b7bu222oIa6a6aO##u'?'?5?""h/&A&AO##u'?'?5m++-/O/O5 	  E_$<$</UmBmBm  sB  F^  s^  s^O++}/O/O5o%%O)C)C54r!   i   )maxsize皙?Fdecoded_sequencemaximum_thresholddebugc                 <   g }t                                           D ]}|                     |                        t          |           }d}|dk     rd}n|dk    rd}nd}t	          | t          d|                    D ]m\  }}	|D ],}
|
                    |          r|
                    |           -|	dk    r	|	|z  dk    s	|	|dz
  k    r!t          d	 |D                       }||k    r nn|r|D ]}t          |j
        |j                   t          |d
          S )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    rV   i       i   r      r   rB   c                     g | ]	}|j         
S r2   )r(   ).0dts     r   
<listcomp>zmess_ratio.<locals>.<listcomp>  s'       !#BH  r!      )r   __subclasses__appendr   zipranger   r#   sumprint	__class__r(   round)r   r   r   	detectorsmd_classlengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectorr   s               r   
mess_ratior     s   
 I&5577 
 
HJJ	
 	
 	
 	
 !""FO||,.))	4,.)),/) 0%62B2BCC  	5! 	) 	)H  ++ )i(((AII%"CCqHHUV\]^V^M^M^! '0   O "333  	 	B   
 	  r!   N)r   F)$	functoolsr   typingr   r   charset_normalizer.constantr   charset_normalizer.utilsr   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r4   r[   rh   rw   r   r   r   r   r.   r/   r   r1   r   r2   r!   r   <module>r      s_         ! ! ! ! ! ! ! ! G G G G G Gr r r r r r r r r r r r r r r r r r r r r r r r r r r r r r r r" " " " " " " "D$K $K $K $K $K'9 $K $K $KNN N N N N1 N N N6E E E E E* E E E2 D  D  D  D  D&8  D  D  DF,/ ,/ ,/ ,/ ,/( ,/ ,/ ,/^@A @A @A @A @A- @A @A @AFB B B B B- B B B>BP BP BP BP BP0 BP BP BPJ-hsm -V^_bVc -hl - - - -` 4/ / / /T /^c / / / / / /r!   