
    Dw^^B                     >   d dl Z d dlZd dlmZ d dlmZ d dlZd dlmZm	Z	m
Z
mZ  e j                  e      Z edddg      Z edg d	      Zh d
ZdhZh dZh dZddhZddhZdhZddhZ ed      ZddgZd Zd Z G d de      Z G d de      Z  G d de      Z!y)    N)
namedtuple)time)ParseResultquoteurlparse
urlunparseRulefieldvalueRequestRate)requestsseconds
start_timeend_time>   disalowdiasllowdisallawdisallowdissalow	dissallowallow>   
user agent
user-agent	useragent>   site-mapsitemapsitemapszcrawl-delayzcrawl delayzrequest-ratezrequest ratehost*$0123456789ABCDEFabcdefProtegoc           
      z    t        | t        v | t        v | t        v | t        v | t
        v | t        v | t        v g      S N)any_DISALLOW_DIRECTIVE_ALLOW_DIRECTIVE_USER_AGENT_DIRECTIVE_SITEMAP_DIRECTIVE_CRAWL_DELAY_DIRECTIVE_REQUEST_RATE_DIRECTIVE_HOST_DIRECTIVE)r
   s    'lib/python3.12/site-packages/protego.py_is_valid_directive_fieldr.      sQ    ,,))..++//00(* + +    c                 2    | j                  d      r| S d| z   S )N/)
startswith)patterns    r-   _enforce_pathr4   )   s    #=r/   c                   "    e Zd ZdZd Zd Zd Zy)_URLPatternz.Internal class which represents a URL pattern.c                 h   || _         t        |      | _        d| j                   v | _        | j                   j	                  d      | _        | j                  r5| j                   d | j                   j                  d       | _        d| _	        y | j
                  r| j                   d d | _        d| _	        y )Nr   r    F)
_patternlenpriority_contains_asteriskendswith_contains_dollarfind_pattern_before_asterisk_pattern_before_dollar_pattern_compiledselfr3   s     r-   __init__z_URLPattern.__init__3   s    G"%"6 $ 6 6s ;"",0MM:R4==;M;Mc;R,SD) "' ""*.--*<D'!&r/   c                    | j                   r| j                  j                  |      S | j                  s6| j                  s|j                  | j                        S || j                  k(  S |j                  | j                        sy| j                  | j                        | _        t        j                  | j                        | _        d| _         | j                  j                  |      S )zDRetun True if pattern matches the given URL, otherwise return False.FT)rB   r9   matchr<   r>   r2   rA   r@   _prepare_pattern_for_regexrecompile)rD   urls     r-   rG   z_URLPattern.match@   s     !!==&&s++&&((~~dmm44 $5555~~d;;<77F

4==1!%}}""3''r/   c                    t        j                  dd|      }t        j                  d|      }t        |      D ]4  \  }}|t        vrt        j
                  |      ||<   '||   dk(  s0d||<   6 dj                  |      }|S )z:Return equivalent regex pattern for the given URL pattern.z\*+r   z(\*|\$$)z.*? )rI   subsplit	enumerate
_WILDCARDSescapejoin)rD   r3   sindexsubstrs        r-   rH   z&_URLPattern._prepare_pattern_for_regexV   sx    &&g.HH['*&q\ME6Z'99V,%5S %	 *
 ''!*r/   N)__name__
__module____qualname____doc__rE   rG   rH    r/   r-   r6   r6   0   s    8'(,
r/   r6   c                       e Zd ZdZd Zd ZddZd Zd Zd Z	d Z
d	 Zd
 Zd Zed        Zej                   d        Zed        Zej                   d        Zy)_RuleSetz3Internal class which stores rules for a user agent.c                 J    d | _         g | _        d | _        d | _        || _        y r$   )
user_agent_rules_crawl_delay	_req_rate_parser_instance)rD   parser_instances     r-   rE   z_RuleSet.__init__f   s'      /r/   c                     |j                         j                         }| j                  dk(  ry| j                  |v rt        | j                        S y)zReturn matching score.r      r   )striplowerr_   r:   )rD   	robotnames     r-   
applies_toz_RuleSet.applies_tom   sD    OO%++-	??c!??i't''r/   c                 `   d|vr|S d }|D ch c]  }dj                  t        |             }}|j                  d      }|d   j                  d      |d<   t	        dt        |            D ]  }t        ||         dk\  rlt        ||   dd       j                  t              rH||   dd j                         }||   dd }	||vr ||      |	j                  d      z   ||<   x||	z   ||<   d	||   j                  d      z   ||<    d
j                  |      j                  d|      S c c}w )z9Replace %xy escapes by their single-character equivalent.%c                 v    t         j                  rt        t        | d            S t        j                  |       S )z6Replaces a %xx escape with equivalent binary sequence.   )sixPY2chrintbytesfromhex)hs    r-   hex_to_bytez&_RuleSet._unquote.<locals>.hex_to_byte{   s(    ww3q":&==##r/   z{:02X}r   utf-8rf      N   %r/   )formatordrO   encoderanger:   setissubset_HEX_DIGITSupperrS   decode)
rD   rK   ignoreerrorsrv   cpartsihexcodeleftovers
             r-   _unquotez_RuleSet._unquotev   s6   c>J	$ 4::6a(//#a&)6:		#8??7+aq#e*%A58}!uQx|$--k:#Ahrl002G$Qx|Hf,#.w#7(//':R#Ra #*X#5a eAhoog66E!H &  xx%%gv66+ ;s   !D+c                 |    t        t        |            dd j                         }t        |      dk(  rd|z  }d|z   S )z!Escape char as RFC 2396 specifiesrx   Nrf   z0%srl   )hexr{   r   r:   )rD   charhex_reprs      r-   	hexescapez_RuleSet.hexescape   s?    s4y>!"%++-x=Ax'HX~r/   c                 <   t        |      }| j                  |j                  d      }t        j                  rt        |j                  d      d      }nt        |d      }t        dd||j                  |j                  |j                        }t        |      }|S )zReturn percent encoded path.z/%r   rw   saferM   r   r   pathro   rp   r   r|   r   paramsqueryfragmentr   )rD   r   r   s      r-   _quote_pathz_RuleSet._quote_path   sx    }}UZZ}577W-D9DD)DBD%,,U^^T% r/   c                    d}|d   dk(  s|d   dk(  s|d   dk(  r
|d   }|d d }t        |      }| j                  |j                  d      }t        j                  rt        |j                  d      d	
      }nt        |d	
      }t        dd||z   |j                  |j                  |j                        }t        |      }|S )NrM   r8   ?;r    z/*$%r   rw   z/*%r   r   )rD   r3   	last_charr   s       r-   _quote_patternz_RuleSet._quote_pattern   s     	2;#!3wr{c7IIcrlG!--

6-:77GNN73%@GG%0GBGi$7u{{TYTbTbcU#r/   c                 D   d|v r0| j                  |j                  d| j                  d                   | j                  |      }|sy | j                  j                  t        dt        |                   |j                  d      r| j                  |d d dz          y y )Nr    r   r
   r   z/index.htmli)	r   replacer   r   r`   append_Ruler6   r=   rC   s     r-   r   z_RuleSet.allow   s    '>JJwsDNN3,?@A%%g.5wk'6JKL M*JJwt}s*+ +r/   c                     d|v r0| j                  |j                  d| j                  d                   | j                  |      }|sy | j                  j                  t        dt        |                   y )Nr    r   r   )r   r   r   r   r`   r   r   r6   rC   s     r-   r   z_RuleSet.disallow   s]    '>MM'//#t~~c/BCD%%g.5zW9MNOr/   c                 @    | j                   j                  d d       y )Nc                 L    | j                   j                  | j                  dk(  fS )Nr   )r   r;   r
   )rs    r-   <lambda>z)_RuleSet.finalize_rules.<locals>.<lambda>   s    (8(8!''W:L'Mr/   T)keyreverse)r`   sortrD   s    r-   finalize_rulesz_RuleSet.finalize_rules   s    MW[\r/   c                     | j                  |      }d}| j                  D ]1  }|j                  j                  |      s|j                  dk(  rd} |S  |S )z!Return if the url can be fetched.Tr   F)r   r`   r   rG   r
   )rD   rK   allowedrules       r-   	can_fetchz_RuleSet.can_fetch   sW    s#KKDzz$::+#G  
 r/   c                     | j                   S )z'Get & set crawl delay for the rule set.)ra   r   s    r-   crawl_delayz_RuleSet.crawl_delay   s        r/   c                     	 t        |      }|| _        y # t        $ r< t        j                  dj	                  | j
                  j                  |             Y y w xY w)NzOMalformed rule at line {} : cannot set crawl delay to '{}'. Ignoring this rule.)float
ValueErrorloggerdebugrz   rc   _total_line_seenra   )rD   delays     r-   r   z_RuleSet.crawl_delay   sY    	%LE "  	LL //5vd6K6K6\6\^c/df		s    AAAc                     | j                   S )z(Get & set request rate for the rule set.)rb   r   s    r-   request_ratez_RuleSet.request_rate   s     ~~r/   c                    	 |j                         }t        |      dk(  r|\  }}n|d   d}}|j                  d      \  }}|d   j                         }t        |      t        |d d       }}|dk(  r|dz  }n|dk(  r|d	z  }n
|d
k(  r|dz  }d }d }	|r\|j                  d      \  }}	t	        t        |d d       t        |dd              }t	        t        |	d d       t        |	dd              }	t        ||||	      | _        y # t
        $ r< t        j                  dj                  | j                  j                  |             Y y w xY w)Nrx   r   rM   r1   r8   m<   ru   i  diQ -zSMalformed rule at line {} : cannot set request rate using '{}'. Ignoring this rule.)rO   r:   rh   rr   r   	Exceptionr   r   rz   rc   r   r   rb   )
rD   r   r   ratetime_periodr   r   	time_unitr   r   s
             r-   r   z_RuleSet.request_rate   s]   	KKME5zQ$)!k$)!Hbk $

3Hg))+I #Hs73B</@gHC2c!4c!5 JH'2'8'8'=$
H!#j!n"5s:bc?7KL
HRaL 13x}3EF %Xw
HM  	LL //5vd6K6K6\6\^c/df		s   C-D AEEN)rM   r   )rW   rX   rY   rZ   rE   rj   r   r   r   r   r   r   r   r   propertyr   setterr   r[   r/   r-   r]   r]   c   s    =0"7H(,P]	 ! ! 	" 	"   N Nr/   r]   c                   p    e Zd Zd Zed        Zd Zd Zd Zd Z	d Z
ed        Zed	        Zed
        Zy)r"   c                 f    i | _         d | _        g | _        i | _        d| _        d| _        d| _        y Nr   )_user_agents_host_sitemap_list_matched_rule_setr   _invalid_directive_seen_total_directive_seenr   s    r-   rE   zProtego.__init__"  s?     
   "$ !'($%&"r/   c                 6     |        }|j                  |       |S r$   )_parse_robotstxt)clscontentos      r-   parsezProtego.parse3  s    E	7#r/   c                    |j                         }g }d }|D ]   }| xj                  dz  c_        |j                  d      }|dk7  r|d| j                         }|j                         }|sU|j                  d      dk7  r|j	                  dd      \  }}nn|j	                  d      }	t        |	      dk  r|	d   }
t        dt        |	            D ]0  }t        |
      r|
dj                  |	|d        }} n|
d|	|   z   z  }
2 |j                         j                         }|j                         }|s|}!|s8|t        vr0t        j                  dj                  | j                               [| xj                  dz  c_        |t        v r|r
|t        vrg }|j                         j                         }d }|d	k7  rd	|v r|j                  d	d
      }||fD ]m  }|s| j                   j#                  |d       }|r||vr|j%                  |       |r<t'        |       }||_        || j                   |<   |j%                  |       o n|t*        v r"|D ]  }|j-                  t/        |              n|t0        v r"|D ]  }|j3                  t/        |              nw|t4        v r| j6                  j%                  |       nS|t8        v r|D ]	  }||_         n<|t<        v r|D ]	  }||_         n%|t@        v r|| _!        n| xjD                  dz  c_"        |} | j                   jG                         D ]  }|jI                           y )Nrf   #r8   r   : rx   z8Rule at line {} without any user agent to enforce it on.r   rM   )%
splitlinesr   r?   rg   rO   r:   r}   r.   rS   rh   r(   r   r   rz   r   r   r   getr   r]   r_   r'   r   r4   r&   r   r)   r   r*   r   r+   r   r,   r   r   valuesr   )rD   r   linescurrent_rule_setsprevious_rule_fieldlinehash_posr
   r   r   possible_filedr   r_   user_agent_without_asteriskrule_sets                  r-   r   zProtego._parse_robotstxt9  sL   ""$  #D!!Q&! yy~H2~Ax(..0 ::<D yy~##zz#q1u 

3u:>!&qq#e*-A0@'5sxxab	7Ju"cE!Hn4N	 . KKM'')EKKME &+# %6K)KW^^_c_t_tuv&&!+&--&+>F[+[(*% #[[]002
.2+$
):2<2D2DS"2M/#-/J"KJ% #0044ZFHH4E$E)00:##+D>.8+8@))*5)00: #L ** 1HNN=#78 !2 -- 1H%%mE&:; !2 ,,""))%000 1H+0H( !2 11 1H,1H) !2 /)"
 ,,1,"'A D ++224J%%' 5r/   c                    | j                   sy| j                  v r| j                     S fd| j                   j                         D        }t        |d       \  }}|sd| j                  <   y|| j                  <   |S )z0Return the rule set with highest matching score.Nc              3   D   K   | ]  }|j                        |f  y wr$   )rj   ).0rsr_   s     r-   	<genexpr>z1Protego._get_matching_rule_set.<locals>.<genexpr>  s"     eJdBz!:B ?Jds    c                     | d   S r   r[   )ps    r-   r   z0Protego._get_matching_rule_set.<locals>.<lambda>  s    PQRSPTr/   )r   )r   r   r   max)rD   r_   score_rule_set_pairsmatch_scorematched_rule_sets    `   r-   _get_matching_rule_setzProtego._get_matching_rule_set  s      ///))*55e$J[J[JbJbJde(+,@n(U%%15D"":.-=z*r/   c                 L    | j                  |      }|sy|j                  |      S )zHReturn True if the user agent can fetch the URL, otherwise return False.T)r   r   )rD   rK   r_   r   s       r-   r   zProtego.can_fetch  s*    66zB))#..r/   c                 B    | j                  |      }|sy|j                  S )zvReturn the crawl delay specified for the user agent as a float.
        If nothing is specified, return None.
        N)r   r   rD   r_   r   s      r-   r   zProtego.crawl_delay  s'      66zB+++r/   c                 B    | j                  |      }|sy|j                  S )zReturn the request rate specified for the user agent as a named tuple
        RequestRate(requests, seconds, start_time, end_time). If nothing is
        specified, return None.
        N)r   r   r   s      r-   r   zProtego.request_rate  s'    
  66zB,,,r/   c                 ,    t        | j                        S )z7Get an iterator containing links to sitemaps specified.)iterr   r   s    r-   r   zProtego.sitemaps  s     D&&''r/   c                     | j                   S )zGet the preferred host.)r   r   s    r-   preferred_hostzProtego.preferred_host  s     zzr/   c                 4    | j                   | j                  z
  S r$   )r   r   r   s    r-   _valid_directive_seenzProtego._valid_directive_seen  s    ))D,H,HHHr/   N)rW   rX   rY   rE   classmethodr   r   r   r   r   r   r   r   r   r   r[   r/   r-   r"   r"      st    '"  
m(^  /,- ( (   I Ir/   )"loggingrI   collectionsr   datetimer   ro   six.moves.urllib.parser   r   r   r   	getLoggerrW   r   r   r   r&   r'   r(   r)   r*   r+   r,   rQ   r~   r   __all__r.   r4   objectr6   r]   r"   r[   r/   r-   <module>r     s     	 "  
0 0 
		8	$6GW-.DF _ 9 A 8 '7 )>: (3Z
*+)
$+0& 0fzNv zNz~If ~Ir/   