B
    ^ Çcî"  ã               @   s\   d Z ddlZddlZddlZdgZe dd¡ZG dd„ dƒZG dd„ dƒZ	G d	d
„ d
ƒZ
dS )a%   robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
é    NÚRobotFileParserÚRequestRatezrequests secondsc               @   sj   e Zd ZdZddd„Zdd„ Zdd„ Zd	d
„ Zdd„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zdd„ ZdS )r   zs This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    Ú c             C   s,   g | _ d | _d| _d| _|  |¡ d| _d S )NFr   )ÚentriesÚdefault_entryÚdisallow_allÚ	allow_allÚset_urlÚlast_checked)ÚselfÚurl© r   ú#lib/python3.7/urllib/robotparser.pyÚ__init__   s    
zRobotFileParser.__init__c             C   s   | j S )z·Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        )r
   )r   r   r   r   Úmtime$   s    zRobotFileParser.mtimec             C   s   ddl }|  ¡ | _dS )zYSets the time the robots.txt file was last fetched to the
        current time.

        r   N)Útimer
   )r   r   r   r   r   Úmodified-   s    zRobotFileParser.modifiedc             C   s&   || _ tj |¡dd… \| _| _dS )z,Sets the URL referring to a robots.txt file.é   é   N)r   ÚurllibÚparseÚurlparseZhostÚpath)r   r   r   r   r   r	   5   s    zRobotFileParser.set_urlc          
   C   s†   yt j | j¡}W nR t jjk
rd } z0|jdkr:d| _n|jdkrT|jdk rTd| _W dd}~X Y nX | 	¡ }|  
| d¡ ¡ ¡ dS )z4Reads the robots.txt URL and feeds it to the parser.)i‘  i“  Ti  iô  Nzutf-8)r   ZrequestZurlopenr   ÚerrorZ	HTTPErrorÚcoder   r   Úreadr   ÚdecodeÚ
splitlines)r   ÚfÚerrÚrawr   r   r   r   :   s    
zRobotFileParser.readc             C   s,   d|j kr| jd kr(|| _n| j |¡ d S )NÚ*)Ú
useragentsr   r   Úappend)r   Úentryr   r   r   Ú
_add_entryG   s    

zRobotFileParser._add_entryc             C   s6  d}t ƒ }|  ¡  x|D ]þ}|sT|dkr8t ƒ }d}n|dkrT|  |¡ t ƒ }d}| d¡}|dkrr|d|… }| ¡ }|s€q| dd¡}t|ƒdkr|d  ¡  ¡ |d< tj	 
|d  ¡ ¡|d< |d dkr |dkrê|  |¡ t ƒ }|j |d ¡ d}q|d dkr4|dkr|j t|d d	ƒ¡ d}q|d d
krh|dkr|j t|d dƒ¡ d}q|d dkr¦|dkr|d  ¡  ¡ r t|d ƒ|_d}q|d dkr|dkr|d  d¡}t|ƒdkr|d  ¡  ¡ r|d  ¡  ¡ rtt|d ƒt|d ƒƒ|_d}qW |dkr2|  |¡ dS )z”Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        r   r   é   ú#Nú:z
user-agentZdisallowFZallowTzcrawl-delayzrequest-rateú/)ÚEntryr   r%   ÚfindÚstripÚsplitÚlenÚlowerr   r   Úunquoter"   r#   Ú	rulelinesÚRuleLineÚisdigitÚintÚdelayr   Úreq_rate)r   ÚlinesÚstater$   ÚlineÚiZnumbersr   r   r   r   P   sd    






 
zRobotFileParser.parsec             C   s    | j r
dS | jrdS | jsdS tj tj |¡¡}tj dd|j|j	|j
|jf¡}tj |¡}|sfd}x"| jD ]}| |¡rn| |¡S qnW | jrœ| j |¡S dS )z=using the parsed robots.txt decide if useragent can fetch urlFTr   r)   )r   r   r
   r   r   r   r0   Ú
urlunparser   ZparamsZqueryZfragmentÚquoter   Ú
applies_toÚ	allowancer   )r   Ú	useragentr   Z
parsed_urlr$   r   r   r   Ú	can_fetch“   s$    
zRobotFileParser.can_fetchc             C   s>   |   ¡ sd S x| jD ]}| |¡r|jS qW | jr:| jjS d S )N)r   r   r=   r5   r   )r   r?   r$   r   r   r   Úcrawl_delay°   s    

zRobotFileParser.crawl_delayc             C   s>   |   ¡ sd S x| jD ]}| |¡r|jS qW | jr:| jjS d S )N)r   r   r=   r6   r   )r   r?   r$   r   r   r   Úrequest_rateº   s    

zRobotFileParser.request_ratec             C   s0   | j }| jd k	r|| jg }d tt|ƒ¡d S )NÚ
)r   r   ÚjoinÚmapÚstr)r   r   r   r   r   Ú__str__Ä   s    
zRobotFileParser.__str__N)r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r	   r   r%   r   r@   rA   rB   rG   r   r   r   r   r      s   
		C

c               @   s(   e Zd ZdZdd„ Zdd„ Zdd„ ZdS )	r2   zoA rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path.c             C   s<   |dkr|sd}t j t j |¡¡}t j |¡| _|| _d S )Nr   T)r   r   r;   r   r<   r   r>   )r   r   r>   r   r   r   r   Î   s
    zRuleLine.__init__c             C   s   | j dkp| | j ¡S )Nr!   )r   Ú
startswith)r   Úfilenamer   r   r   r=   Ö   s    zRuleLine.applies_toc             C   s   | j r
dndd | j S )NZAllowZDisallowz: )r>   r   )r   r   r   r   rG   Ù   s    zRuleLine.__str__N)rH   rI   rJ   rK   r   r=   rG   r   r   r   r   r2   Ë   s   r2   c               @   s0   e Zd ZdZdd„ Zdd„ Zdd„ Zdd	„ Zd
S )r*   z?An entry has one or more user-agents and zero or more rulelinesc             C   s   g | _ g | _d | _d | _d S )N)r"   r1   r5   r6   )r   r   r   r   r   ß   s    zEntry.__init__c             C   s   g }x| j D ]}| d|› ¡ qW | jd k	r@| d| j› ¡ | jd k	rj| j}| d|j› d|j› ¡ | tt| j	ƒ¡ | d¡ d 
|¡S )NzUser-agent: zCrawl-delay: zRequest-rate: r)   r   rC   )r"   r#   r5   r6   ZrequestsZsecondsÚextendrE   rF   r1   rD   )r   ZretÚagentZrater   r   r   rG   å   s    


zEntry.__str__c             C   sF   |  d¡d  ¡ }x.| jD ]$}|dkr*dS | ¡ }||krdS qW dS )z2check if this entry applies to the specified agentr)   r   r!   TF)r-   r/   r"   )r   r?   rO   r   r   r   r=   ò   s    zEntry.applies_toc             C   s$   x| j D ]}| |¡r|jS qW dS )zZPreconditions:
        - our agent applies to this entry
        - filename is URL decodedT)r1   r=   r>   )r   rM   r9   r   r   r   r>   ÿ   s    

zEntry.allowanceN)rH   rI   rJ   rK   r   rG   r=   r>   r   r   r   r   r*   Ý   s
   r*   )rK   ÚcollectionsZurllib.parser   Zurllib.requestÚ__all__Ú
namedtupler   r   r2   r*   r   r   r   r   Ú<module>   s    6