
    I_B                     `   d dl Z d dlZd dlmZ d dlmZ d dlZd dlmZm	Z	m
Z
mZ  e j        e          Z edddg          Z edg d	          Zh d
ZdhZh dZh dZddhZddhZdhZddhZ ed          ZddgZd Zd Z G d de          Z G d de          Z  G d de          Z!dS )    N)
namedtuple)time)ParseResultquoteurlparse
urlunparseRulefieldvalueRequestRate)requestsseconds
start_timeend_time>   disalowdiasllowdisallawdisallowdissalow	dissallowallow>   
user agent
user-agent	useragent>   site-mapsitemapsitemapszcrawl-delayzcrawl delayzrequest-ratezrequest ratehost*$0123456789ABCDEFabcdefProtegoc           
          t          | t          v | t          v | t          v | t          v | t
          v | t          v | t          v g          S N)any_DISALLOW_DIRECTIVE_ALLOW_DIRECTIVE_USER_AGENT_DIRECTIVE_SITEMAP_DIRECTIVE_CRAWL_DELAY_DIRECTIVE_REQUEST_RATE_DIRECTIVE_HOST_DIRECTIVE)r
   s    'lib/python3.11/site-packages/protego.py_is_valid_directive_fieldr.      sV    ,,))..++//00(* + + +    c                 :    |                      d          r| S d| z   S )N/)
startswith)patterns    r-   _enforce_pathr4   )   s'    # =r/   c                   $    e Zd ZdZd Zd Zd ZdS )_URLPatternz.Internal class which represents a URL pattern.c                 B   || _         t          |          | _        d| j         v | _        | j                             d          | _        | j        r-| j         d | j                             d                   | _        n| j        r| j         d d         | _        d| _	        d S )Nr   r    F)
_patternlenpriority_contains_asteriskendswith_contains_dollarfind_pattern_before_asterisk_pattern_before_dollar_pattern_compiledselfr3   s     r-   __init__z_URLPattern.__init__3   s    G"%"6 $ 6 6s ; ;" 	=,0M:R4=;M;Mc;R;R:R,SD))" 	=*.-*<D'!&r/   c                    | j         r| j                            |          S | j        s,| j        s|                    | j                  S || j        k    S |                    | j                  sdS |                     | j                  | _        t          j
        | j                  | _        d| _         | j                            |          S )zDRetun True if pattern matches the given URL, otherwise return False.FT)rB   r9   matchr<   r>   r2   rA   r@   _prepare_pattern_for_regexrecompile)rD   urls     r-   rG   z_URLPattern.match@   s     ! 	,=&&s+++& 	6( 5~~dm444 $555~~d;<< 	577FF
4=11!%}""3'''r/   c                    t          j        dd|          }t          j        d|          }t          |          D ]7\  }}|t          vrt          j        |          ||<   &||         dk    rd||<   8d                    |          }|S )z:Return equivalent regex pattern for the given URL pattern.z\*+r   z(\*|\$$)z.*? )rI   subsplit	enumerate
_WILDCARDSescapejoin)rD   r3   sindexsubstrs        r-   rH   z&_URLPattern._prepare_pattern_for_regexV   s    &g..H['**&q\\ 	! 	!ME6Z''9V,,%5S %''!**r/   N)__name__
__module____qualname____doc__rE   rG   rH    r/   r-   r6   r6   0   sG        88' ' '( ( (,
 
 
 
 
r/   r6   c                       e Zd ZdZd Zd ZddZd Zd Zd	 Z	d
 Z
d Zd Zd Zed             Zej        d             Zed             Zej        d             ZdS )_RuleSetz3Internal class which stores rules for a user agent.c                 L    d | _         g | _        d | _        d | _        || _        d S r$   )
user_agent_rules_crawl_delay	_req_rate_parser_instance)rD   parser_instances     r-   rE   z_RuleSet.__init__f   s-      /r/   c                     |                                                                 }| j        dk    rdS | j        |v rt          | j                  S dS )zReturn matching score.r      r   )striplowerr_   r:   )rD   	robotnames     r-   
applies_toz_RuleSet.applies_tom   sQ    OO%%++--	?c!!1?i''t'''qr/   rM   replacec                    d|vr|S d }d |D             }|                     d          }|d                             d          |d<   t          dt          |                    D ]}t          ||                   dk    rt	          ||         dd                                       t                    rc||         dd                                         }||         dd         }||vr% ||          |                    d          z   ||<   ||z   ||<   d	||                             d          z   ||<   d
                    |          	                    d|          S )z9Replace %xy escapes by their single-character equivalent.%c                     t           j        rt          t          | d                    S t                              |           S )z6Replaces a %xx escape with equivalent binary sequence.   )sixPY2chrintbytesfromhex)hs    r-   hex_to_bytez&_RuleSet._unquote.<locals>.hex_to_byte{   s2    w '3q"::&==###r/   c                 R    h | ]$}d                      t          |                    %S )z{:02X})formatord).0cs     r-   	<setcomp>z$_RuleSet._unquote.<locals>.<setcomp>   s*    :::a(//#a&&)):::r/   r   utf-8rf      N   %r/   )
rO   encoderanger:   setissubset_HEX_DIGITSupperrS   decode)	rD   rK   ignoreerrorsrw   partsihexcodeleftovers	            r-   _unquotez_RuleSet._unquotev   sj   c>>J	$ 	$ 	$ ;:6:::		#8??7++aq#e**%% 	7 	7A58}}!!uQx|$$--k:: 6#Ahrrl0022G$Qx|Hf,,#.;w#7#7(//':R:R#Ra #*X#5a eAhoog666E!HHxx%%gv666r/   c                     t          t          |                    dd                                         }t          |          dk    rd|z  }d|z   S )z!Escape char as RFC 2396 specifiesr   Nrf   z0%srm   )hexrz   r   r:   )rD   charhex_reprs      r-   	hexescapez_RuleSet.hexescape   sM    s4yy>>!""%++--x==Ax'HX~r/   c                 D   t          |          }|                     |j        d          }t          j        r%t          |                    d          d          }nt          |d          }t          dd||j        |j	        |j
                  }t          |          }|S )zReturn percent encoded path.z/%r   r~   saferM   r   r   pathrp   rq   r   r   r   paramsqueryfragmentr   )rD   r   r   s      r-   _quote_pathz_RuleSet._quote_path   s    }}UZ}557 	*W--D999DDD)))DBD%,U^TT%  r/   c                    d}|d         dk    s|d         dk    s|d         dk    r|d         }|d d         }t          |          }|                     |j        d          }t          j        r%t          |                    d          d	
          }nt          |d	
          }t          dd||z   |j        |j	        |j
                  }t          |          }|S )NrM   r8   ?;r    z/*$%r   r~   z/*%r   r   )rD   r3   	last_charr   s       r-   _quote_patternz_RuleSet._quote_pattern   s     	2;#!3!3wr{c7I7IIcrclG!!--
6-::7 	1GNN733%@@@GGG%000GBGi$7u{TYTbccU##r/   c                    d|v r<|                      |                    d|                     d                               |                     |          }|sd S | j                            t          dt          |                               |                    d          r"|                      |d d         dz              d S d S )Nr    r   r
   r   z/index.htmli)	r   rk   r   r   r`   append_Ruler6   r=   rC   s     r-   r   z_RuleSet.allow   s    '>>JJwsDNN3,?,?@@AAA%%g.. 	F5wk'6J6JKKKLLL M** 	,JJwtt}s*+++++	, 	,r/   c                 $   d|v r<|                      |                    d|                     d                               |                     |          }|sd S | j                            t          dt          |                               d S )Nr    r   r   )r   rk   r   r   r`   r   r   r6   rC   s     r-   r   z_RuleSet.disallow   s    '>>MM'//#t~~c/B/BCCDDD%%g.. 	F5zW9M9MNNNOOOOOr/   c                 @    | j                             d d           d S )Nc                 0    | j         j        | j        dk    fS )Nr   )r   r;   r
   )rs    r-   <lambda>z)_RuleSet.finalize_rules.<locals>.<lambda>   s    (8!'W:L'M r/   T)keyreverse)r`   sortrD   s    r-   finalize_rulesz_RuleSet.finalize_rules   s(    MMW[\\\\\r/   c                     |                      |          }d}| j        D ]+}|j                            |          r|j        dk    rd} n,|S )z!Return if the url can be fetched.Tr   F)r   r`   r   rG   r
   )rD   rK   allowedrules       r-   	can_fetchz_RuleSet.can_fetch   sf    s##K 	 	Dz$$ :++#G r/   c                     | j         S )z'Get & set crawl delay for the rule set.)ra   r   s    r-   crawl_delayz_RuleSet.crawl_delay   s       r/   c                     	 t          |          }nI# t          $ r< t                              d                    | j        j        |                     Y d S w xY w|| _        d S )NzOMalformed rule at line {} : cannot set crawl delay to '{}'. Ignoring this rule.)float
ValueErrorloggerdebugry   rc   _total_line_seenra   )rD   delays     r-   r   z_RuleSet.crawl_delay   sz    	%LLEE 	 	 	LL //5vd6K6\^c/d/df f fFF		 "s    AAAc                     | j         S )z(Get & set request rate for the rule set.)rb   r   s    r-   request_ratez_RuleSet.request_rate   s     ~r/   c                 R   	 |                                 }t          |          dk    r|\  }}n
|d         d}}|                     d          \  }}|d                                         }t          |          t          |d d                   }}|dk    r|dz  }n|dk    r|d	z  }n|d
k    r|dz  }d }d }	|r|                     d          \  }}	t	          t          |d d                   t          |dd                              }t	          t          |	d d                   t          |	dd                              }	nI# t
          $ r< t                              d                    | j	        j
        |                     Y d S w xY wt          ||||	          | _        d S )Nr   r   rM   r1   r8   m<   rv   i  diQ -zSMalformed rule at line {} : cannot set request rate using '{}'. Ignoring this rule.)rO   r:   rh   rs   r   	Exceptionr   r   ry   rc   r   r   rb   )
rD   r   r   ratetime_periodr   r   	time_unitr   r   s
             r-   r   z_RuleSet.request_rate   s   	KKMME5zzQ$)!kk$)!Hbk $

3Hg))++I #Hs73B3</@/@gHC2c!!4c!!5 JH G'2'8'8'='=$
H!#j!n"5"5s:bcc?7K7KLL
HRaRL 1 13x}3E3EFF 	 	 	LL //5vd6K6\^c/d/df f fFF		 %Xw
HMMs   EE AFFN)rM   rk   )rW   rX   rY   rZ   rE   rj   r   r   r   r   r   r   r   r   propertyr   setterr   r[   r/   r-   r]   r]   c   s5       ==0 0 0  "7 "7 "7 "7H      (, , ,P P P] ] ]	 	 	 ! ! X! 	" 	" 	"   X N N N N Nr/   r]   c                       e Zd Zd Zed             Zd Zd Zd Zd Z	d Z
ed             Zed	             Zed
             ZdS )r"   c                 h    i | _         d | _        g | _        i | _        d| _        d| _        d| _        d S Nr   )_user_agents_host_sitemap_list_matched_rule_setr   _invalid_directive_seen_total_directive_seenr   s    r-   rE   zProtego.__init__"  sE     
   "$ !'($%&"""r/   c                 D     |             }|                     |           |S r$   )_parse_robotstxt)clscontentos      r-   parsezProtego.parse3  s&    CEE	7###r/   c                 \   |                                 }g }d }|D ]_}| xj        dz  c_        |                    d          }|dk    r|d|                                         }|                                }|sa|                    d          dk    r|                    dd          \  }}n|                    d          }	t          |	          dk     r|	d         }
t          dt          |	                    D ]@}t          |
          r!|
d                    |	|d                    }} n|
d|	|         z   z  }
A%|                                	                                }|                                }|s|}e|s=|t          vr4t                              d                    | j                             | xj        dz  c_        |t          v r|r|t          vrg }|                                	                                }d }|d	k    rd	|v r|                    d	d
          }||fD ]r}|s| j                            |d           }|r||vr|                    |           |s5t'          |           }||_        || j        |<   |                    |           sn|t*          v r(|D ]$}|                    t/          |                     %n|t0          v r(|D ]$}|                    t/          |                     %nq|t4          v r| j                            |           nM|t8          v r|D ]	}||_        
n7|t<          v r|D ]	}||_        
n!|t@          v r|| _!        n| xj"        dz  c_"        |}a| j        #                                D ]}|$                                 d S )Nrf   #r8   r   : r   z8Rule at line {} without any user agent to enforce it on.r   rM   )%
splitlinesr   r?   rg   rO   r:   r   r.   rS   rh   r(   r   r   ry   r   rk   r   getr   r]   r_   r'   r   r4   r&   r   r)   r   r*   r   r+   r   r,   r   r   valuesr   )rD   r   linescurrent_rule_setsprevious_rule_fieldlinehash_posr
   r   r   possible_filedr   r_   user_agent_without_asteriskrule_sets                  r-   r   zProtego._parse_robotstxt9  si   ""$$  # `	( `	(D!!Q&!! yy~~H2~~AxK(..00 ::<<D  yy~~###zz#q11uu 

3u::>>!&qq#e**--  A0@@ '5sxxabb	7J7Ju"cE!Hn4NNKKMM''))EKKMME  &+# % 6K)K)KW^^_c_tuuvvv&&!+&&---& ++>F[+[+[(*% #[[]]0022
.2+$$
):):2<2D2DS"2M2M/#-/J"K ; ;J% ! #044ZFFH ;H4E$E$E)00:::# ;#+D>>.8+8@)*5)00:::; *** 1 9 9HNN=#7#788889 --- 1 < <H%%mE&:&:;;;;< ,,,"))%0000000 1 1 1H+0H((1 111 1 2 2H,1H))2 /))"

 ,,1,,"'+2244 	( 	(J%%''''	( 	(r/   c                     | j         sdS | j        v r| j                 S fd| j                                         D             }t          |d           \  }}|sd| j        <   dS || j        <   |S )z0Return the rule set with highest matching score.Nc              3   F   K   | ]}|                               |fV  d S r$   )rj   )r{   rsr_   s     r-   	<genexpr>z1Protego._get_matching_rule_set.<locals>.<genexpr>  s4      eeBz!:!:B ?eeeeeer/   c                     | d         S r   r[   )ps    r-   r   z0Protego._get_matching_rule_set.<locals>.<lambda>  s    PQRSPT r/   )r   )r   r   r   max)rD   r_   score_rule_set_pairsmatch_scorematched_rule_sets    `   r-   _get_matching_rule_setzProtego._get_matching_rule_set  s      	4///)*55eeee$J[JbJbJdJdeee(+,@nn(U(U(U%% 	15D":.4-=z*r/   c                 ^    |                      |          }|sdS |                    |          S )zHReturn True if the user agent can fetch the URL, otherwise return False.T)r   r   )rD   rK   r_   r   s       r-   r   zProtego.can_fetch  s7    66zBB 	4))#...r/   c                 B    |                      |          }|sdS |j        S )zvReturn the crawl delay specified for the user agent as a float.
        If nothing is specified, return None.
        N)r   r   rD   r_   r   s      r-   r   zProtego.crawl_delay  s.      66zBB 	4++r/   c                 B    |                      |          }|sdS |j        S )zReturn the request rate specified for the user agent as a named tuple
        RequestRate(requests, seconds, start_time, end_time). If nothing is
        specified, return None.
        N)r   r   r   s      r-   r   zProtego.request_rate  s.    
  66zBB 	4,,r/   c                 *    t          | j                  S )z7Get an iterator containing links to sitemaps specified.)iterr   r   s    r-   r   zProtego.sitemaps  s     D&'''r/   c                     | j         S )zGet the preferred host.)r   r   s    r-   preferred_hostzProtego.preferred_host  s     zr/   c                      | j         | j        z
  S r$   )r   r   r   s    r-   _valid_directive_seenzProtego._valid_directive_seen  s    )D,HHHr/   N)rW   rX   rY   rE   classmethodr   r   r   r   r   r   r   r   r   r   r[   r/   r-   r"   r"      s        ' ' '"   [
m( m( m(^      / / /, , ,- - - ( ( X(   X I I XI I Ir/   )"loggingrI   collectionsr   datetimer   rp   six.moves.urllib.parser   r   r   r   	getLoggerrW   r   r   r   r&   r'   r(   r)   r*   r+   r,   rQ   r   r   __all__r.   r4   objectr6   r]   r"   r[   r/   r-   <module>r     s    				 " " " " " "       



0 0 0 0 0 0 0 0 0 0 0 0 
	8	$	$
6GW-..jDDDF F _^^ 9 AAA 888 '7 )>: (3Z
c*++)
$+ + +  0 0 0 0 0& 0 0 0fzN zN zN zN zNv zN zN zNz~I ~I ~I ~I ~If ~I ~I ~I ~I ~Ir/   