
    a-                         d Z ddlZddlZddlZddlmZ ddlmZmZ ddlm	Z	  ej
        dej                  Z ej
        dej                  Z ej
         ej        d          ej                  Z ej
         ej        d	          ej        ej        z            Z ej
        d
ej                  ZdZddZddZd dZd!dZ ej
        dej                  Zd dZd"dZd#dZ	 	 d$dZd%dZd&dZd'dZ d Z!dS )(z(
Functions for dealing with markup text
    N)moves)to_bytes
to_unicode)safe_url_stringzI&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)z<[a-zA-Z\/!].*?>z5<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']z}<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)z<((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))z 	
 Tutf-8c                 Z    t          j        dt                     t          | |||          S )z

    .. warning::

        This function is deprecated and will be removed in future.
        Please use :func:`replace_entities` instead.
    z`w3lib.html.remove_entities` function is deprecated and will be removed in future releases. Please use `w3lib.html.replace_entities` instead.)warningswarnDeprecationWarningreplace_entities)textkeepremove_illegalencodings       *lib/python3.11/site-packages/w3lib/html.pyremove_entitiesr      s8     M	1 		   D$AAA    c                 d    fd}t                               |t          | |                    S )u  Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are removed.
    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    u'Price: \xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    c                 8   |                                  }|                    d          rt          |d         d          }n|                    d          rt          |d         d          }n|                    d          r|d         }|                                v r|                     d          S t
          j        j                            |          p5t
          j        j                            |                                          }|\	 d|cxk    rdk    r*n n't          j	        |          
                    d	          S t          j        |          S # t          $ r Y nw xY wr|                    d
          rdn|                     d          S )Ndec
   hex   namedr         cp1252	semicolon )	groupdictgetintlowergroupr   html_entitiesname2codepointsixint2bytedecodeunichr
ValueError)mgroupsnumberentity_namer   r   s       r   convert_entityz(replace_entities.<locals>.convert_entityE   s   ::e 
	Q++FFZZ 	Q++FFZZ   	Q /K  ""d**wwqzz!-<@@MM P'6::;;L;L;N;NOO 
6))))T)))))<//66x@@@:f---    %PK)@)@PssaggajjPs   6E 
E 
E+*E+)_ent_resubr   )r   r   r   r   r1   s    ``  r   r   r   )   sH    8Q Q Q Q Q Q8 ;;~z$'A'ABBBr   c                 l    t          t                              t          | |                              S N)boolr2   searchr   r   r   s     r   has_entitiesr9   c   s&    z$99::;;;r   r    c                 T    t                               |t          | |                    S )af  Replace all markup tags found in the given `text` by the given token.
    By default `token` is an empty string so it just removes all tags.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    Always returns a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags(u'This text contains <a>some tag</a>')
    u'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\xe7ais</b></p>', ' -- ', 'latin-1')
    u' -- Je ne parle pas  -- fran\xe7ais --  -- '
    >>>

    )_tag_rer3   r   )r   tokenr   s      r   replace_tagsr=   f   s"    ( ;;ujx88999r   z<!--.*?(?:-->|$)c                 X    t          | |          } t                              d|           S )z Remove HTML Comments.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    u'test  whatever'
    >>>

    r    )r   _REMOVECOMMENTS_REr3   r8   s     r   remove_commentsr@   ~   s)     dH%%D!!#t,,,r   c                    rr
J d            d D             d D             fdfd}d}t          j        |t           j        t           j        z            }|                    |t          | |                    S )a1   Remove HTML Tags only.

    `which_ones` and `keep` are both tuples, there are four cases:

    ==============  ============= ==========================================
    ``which_ones``  ``keep``      what it does
    ==============  ============= ==========================================
    **not empty**   empty         remove all tags in ``which_ones``
    empty           **not empty** remove all tags except the ones in ``keep``
    empty           empty         remove all tags
    **not empty**   **not empty** not allowed
    ==============  ============= ==========================================


    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    u'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    u'<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    u'<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/usr/local/lib/python2.7/dist-packages/w3lib/html.py", line 101, in remove_tags
        assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'
    AssertionError: which_ones and keep can not be given at the same time
    >>>

    z5which_ones and keep can not be given at the same timec                 6    h | ]}|                                 S r   r$   .0tags     r   	<setcomp>zremove_tags.<locals>.<setcomp>   s     444##))++444r   c                 6    h | ]}|                                 S r   rC   rD   s     r   rG   zremove_tags.<locals>.<setcomp>   s     (((CCIIKK(((r   c                 @    |                                  } r| v S | vS r5   rC   )rF   r   
which_oness    r   will_removez remove_tags.<locals>.will_remove   s-    iikk 	#*$$d?"r   c                 r    |                      d          } |          rdn|                      d          S )N   r    r   )r%   )r-   rF   rK   s     r   
remove_tagzremove_tags.<locals>.remove_tag   s5    ggajj!k#&&6ssAGGAJJ6r   z</?([^ >/]+).*?>)recompileDOTALL
IGNORECASEr3   r   )r   rJ   r   r   rN   regexretagsrK   s    ``    @r   remove_tagsrU      s    ^ ]t]]&]]]]44444J((4(((D# # # # # #7 7 7 7 7 EZry2=899F::j*T8"<"<===r   c                     t          | |          } |rad                    d |D                       }t          j        |t          j        t          j        z            }|                    d|           } | S )a  Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    u'<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    |c           	      &    g | ]}d |d|d|dS )<z\b.*?</z>|<z\s*/>r   rD   s     r   
<listcomp>z,remove_tags_with_content.<locals>.<listcomp>   s-    [[[ccc333D[[[r   r    )r   joinrO   rP   rQ   rR   r3   )r   rJ   r   tagsrT   s        r   remove_tags_with_contentr]      sn     dH%%D %xx[[PZ[[[\\D")bm";<<zz#t$$Kr   
	c                 x    t          | |          } |D ]&}|                     |t          ||                    } '| S )a$  Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\n``, ``\t``, ``\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    )r   replace)r   rJ   
replace_byr   ecs        r   replace_escape_charsrf      sH     dH%%D B B||B
:x @ @AAKr   c                     d }t          | |          } d} || t                    D ]J}t          |t          j                  r|t          |||          z  }2||                    d          z  }K|S )a`  
    This function receives markup as a text (always a unicode string or
    a UTF-8 encoded string) and does the following:

    1. removes entities (except the ones in `keep`) from any part of it
        that is not inside a CDATA
    2. searches for CDATAs and extracts their text (if any) without modifying it.
    3. removes the found CDATAs

    c              3      K   d}|                     |           D ],}|                    d          \  }}| ||         V  |V  |}-| |d          V  d S )Nr   rM   )finditerspan)txtpatternoffsetmatchmatch_smatch_es         r   _get_fragmentsz&unquote_markup.<locals>._get_fragments  s{      %%c** 	 	E$zz!}}GWfWn%%%%KKKFF&''lr   r    )r   r   cdata_d)r   	_cdata_re
isinstancer(   string_typesr   r%   )r   r   r   r   rq   ret_textfragments          r   unquote_markuprx      s       dH%%DH"N433 2 2h 011 	2(^\\\\HH y111HHOr   c                     t          | |          } t                              |           }|rTt          j        j                            t          |          t          |                    d          |                    S t          |          S )zReturn the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    rM   )r   )	r   _baseurl_rer7   r   urllibparseurljoinr   r%   )r   baseurlr   r-   s       r   get_base_urlr     s     dH%%D4  A (|!))G$$AGGAJJ:::
 
 	

 w'''r   scriptnoscriptc                 0   t           j        rt          ||          }	 t          | |          } n# t          $ r t          |             w xY wt          | |          } t          t          |                     } t          
                    |           }|rt          |                    d                    }t          |                    d                              d          |          }t          j        j                            ||          }||fS dS )aY  Return  the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.

    r#   urlz "')NN)r(   PY2r   r   UnicodeDecodeErrorprintr]   r@   r   _meta_refresh_rer7   floatr%   r   stripr   r{   r|   r}   )r   r~   r   ignore_tagsr-   intervalr   s          r   get_meta_refreshr   (  s    w .7H--$))   d $D+66D+D1122D%%A ((aggenn226::HEEl ((#66}zs	   / A
c                 6    |                      t                    S )a  
    Strip all leading and trailing space characters (as defined in
    https://www.w3.org/TR/html5/infrastructure.html#space-character).

    Such stripping is useful e.g. for processing HTML element attributes which
    contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
    defines them as "valid URL potentially surrounded by spaces"
    or "valid non-empty URL potentially surrounded by spaces".

    >>> strip_html5_whitespace(' hello\n')
    'hello'
    )r   HTML5_WHITESPACE)r   s    r   strip_html5_whitespacer   E  s     ::&'''r   )r   Tr   r5   )r    N)r   r   N)r   N)r^   r    N)r   TN)r    r   )r    r   r   )"__doc__r
   rO   r(   r   
w3lib.utilr   r   	w3lib.urlr   rP   rR   r2   rQ   r;   uIrz   r   rs   r   r   r   r9   r=   r?   r@   rU   r]   rf   rx   r   r   r   r   r   r   <module>r      sP     				 



       + + + + + + + + % % % % % %
"*acecp
q
q
"*(")
4
4bjWXXZ\Z^__2:ece  %e  f  f  hj  hq  tv  tA  hA  B  B BJVXZXabb	  B B B B$8C 8C 8C 8Ct< < < <: : : :.  RZ 3RY?? - - - -B> B> B> B>H   , JM   "   @( ( ( ($   :( ( ( ( (r   