
    a-                     d   d Z ddlZddlZddlZddlmZ ddlmZmZ ddlm	Z	  ej                  dej                        Z ej                  dej                        Z ej                   ej                  d      ej                         Z ej                   ej                  d	      ej                  ej                  z        Z ej                  d
ej                        ZdZddZddZddZddZ ej                  dej                        ZddZddZddZ	 	 ddZddZd dZd!dZ d Z!y)"z(
Functions for dealing with markup text
    N)moves)to_bytes
to_unicode)safe_url_stringzI&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)z<[a-zA-Z\/!].*?>z5<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']z}<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)z<((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))z 	
c                 R    t        j                  dt               t        | |||      S )z

    .. warning::

        This function is deprecated and will be removed in future.
        Please use :func:`replace_entities` instead.
    z`w3lib.html.remove_entities` function is deprecated and will be removed in future releases. Please use `w3lib.html.replace_entities` instead.)warningswarnDeprecationWarningreplace_entities)textkeepremove_illegalencodings       *lib/python3.12/site-packages/w3lib/html.pyremove_entitiesr      s-     MM	1 		 D$AA    c                 R    fd}t         j                  |t        | |            S )u  Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are removed.
    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    u'Price: \xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    c                    | j                         }|j                  d      rt        |d   d      }n|j                  d      rt        |d   d      }n|j                  d      r|d   }|j                         v r| j	                  d      S t
        j                  j                  j                  |      xs7 t
        j                  j                  j                  |j                               }H	 d|cxk  rdk  r'n n$t        j                  |      j                  d	      S t        j                  |      S r|j                  d
      rdS | j	                  d      S # t        $ r Y 1w xY w)Ndec
   hex   namedr         cp1252	semicolon )	groupdictgetintlowergroupr   html_entitiesname2codepointsixint2bytedecodeunichr
ValueError)mgroupsnumberentity_namer   r   s       r   convert_entityz(replace_entities.<locals>.convert_entityE   s9   ::e+FZZ+FZZ  /K  "d*wwqz!--<<@@M P''66::;;L;L;NO 
6)T)<</66x@@::f-- %K)@sPaggajP  s   21E $E 	E+*E+)_ent_resubr   )r   r   r   r   r/   s    ``  r   r   r   )   s#    8Q8 ;;~z$'ABBr   c                 R    t        t        j                  t        | |                  S N)boolr0   searchr   r   r   s     r   has_entitiesr7   c   s    z$9:;;r   c                 B    t         j                  |t        | |            S )af  Replace all markup tags found in the given `text` by the given token.
    By default `token` is an empty string so it just removes all tags.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    Always returns a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags(u'This text contains <a>some tag</a>')
    u'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\xe7ais</b></p>', ' -- ', 'latin-1')
    u' -- Je ne parle pas  -- fran\xe7ais --  -- '
    >>>

    )_tag_rer1   r   )r   tokenr   s      r   replace_tagsr;   f   s    ( ;;ujx899r   z<!--.*?(?:-->|$)c                 F    t        | |      } t        j                  d|       S )z Remove HTML Comments.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    u'test  whatever'
    >>>

    r   )r   _REMOVECOMMENTS_REr1   r6   s     r   remove_commentsr>   ~   s#     dH%D!!#t,,r   c                 f   r	rJ d       D ch c]  }|j                          c}D ch c]  }|j                          c}fdfd}d}t        j                  |t        j                  t        j                  z        }|j                  |t        | |            S c c}w c c}w )a1   Remove HTML Tags only.

    `which_ones` and `keep` are both tuples, there are four cases:

    ==============  ============= ==========================================
    ``which_ones``  ``keep``      what it does
    ==============  ============= ==========================================
    **not empty**   empty         remove all tags in ``which_ones``
    empty           **not empty** remove all tags except the ones in ``keep``
    empty           empty         remove all tags
    **not empty**   **not empty** not allowed
    ==============  ============= ==========================================


    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    u'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    u'<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    u'<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/usr/local/lib/python2.7/dist-packages/w3lib/html.py", line 101, in remove_tags
        assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'
    AssertionError: which_ones and keep can not be given at the same time
    >>>

    z5which_ones and keep can not be given at the same timec                 8    | j                         } r| v S | vS r3   )r"   )tagr   
which_oness    r   will_removez remove_tags.<locals>.will_remove   s&    iik*$$d?"r   c                 \    | j                  d      } |      rdS | j                  d      S )N   r   r   )r#   )r+   rA   rC   s     r   
remove_tagzremove_tags.<locals>.remove_tag   s*    ggaj!#&s6AGGAJ6r   z</?([^ >/]+).*?>)r"   recompileDOTALL
IGNORECASEr1   r   )	r   rB   r   r   rA   rF   regexretagsrC   s	    ``     @r   remove_tagsrM      s    ^ t]&]]$)34##))+4J#'(4CCIIK4(D#7 EZZryy2==89F::j*T8"<==# 5(s
   B)B.c                    t        | |      } |rpdj                  |D cg c]  }d|d|d|d c}      }t        j                  |t        j                  t        j
                  z        }|j                  d|       } | S c c}w )a  Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    u'<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    |<z\b.*?</z>|<z\s*/>r   )r   joinrG   rH   rI   rJ   r1   )r   rB   r   rA   tagsrL   s         r   remove_tags_with_contentrS      sr     dH%DxxPZ[PZc3DPZ[\D"))bmm";<zz#t$K \s   Bc                 d    t        | |      } |D ]  }| j                  |t        ||            }   | S )a$  Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\n``, ``\t``, ``\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    )r   replace)r   rB   
replace_byr   ecs        r   replace_escape_charsrX      s6     dH%D||B
:x @A Kr   c                     d }t        | |      } d} || t              D ]B  }t        |t        j                        r|t        |||      z  }/||j                  d      z  }D |S )a`  
    This function receives markup as a text (always a unicode string or
    a UTF-8 encoded string) and does the following:

    1. removes entities (except the ones in `keep`) from any part of it
        that is not inside a CDATA
    2. searches for CDATAs and extracts their text (if any) without modifying it.
    3. removes the found CDATAs

    c              3      K   d}|j                  |       D ]#  }|j                  d      \  }}| ||  | |}% | |d   y w)Nr   rE   )finditerspan)txtpatternoffsetmatchmatch_smatch_es         r   _get_fragmentsz&unquote_markup.<locals>._get_fragments  sW     %%c*E$zz!}GWfW%%KF	 +
 &'ls   AAr   )r   r   cdata_d)r   	_cdata_re
isinstancer&   string_typesr   r#   )r   r   r   r   rc   ret_textfragments          r   unquote_markuprj      sk     dH%DH"43h 0 01(^\\H y11H 4 Or   c                     t        | |      } t        j                  |       }|rMt        j                  j
                  j                  t        |      t        |j                  d      |            S t        |      S )zReturn the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    rE   )r   )	r   _baseurl_rer5   r   urllibparseurljoinr   r#   )r   baseurlr   r+   s       r   get_base_urlrq     sg     dH%D4 A||!!))G$AGGAJ:
 	

 w''r   c                    t         j                  rt        ||      }	 t        | |      } t        | |      } t        t        |             } t        j                  |       }|rrt        |j                  d            }t        |j                  d      j                  d      |      }t        j                   j"                  j%                  ||      }||fS y# t        $ r t        |         w xY w)aY  Return  the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.

    r!   urlz "')NN)r&   PY2r   r   UnicodeDecodeErrorprintrS   r>   r   _meta_refresh_rer5   floatr#   r   stripr   rm   rn   ro   )r   rp   r   ignore_tagsr+   intervalrs   s          r   get_meta_refreshr|   (  s     ww7H-$) $D+6D+D12D%A(aggen226:HEll  ((#6}  ds   C C*c                 ,    | j                  t              S )a  
    Strip all leading and trailing space characters (as defined in
    https://www.w3.org/TR/html5/infrastructure.html#space-character).

    Such stripping is useful e.g. for processing HTML element attributes which
    contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
    defines them as "valid URL potentially surrounded by spaces"
    or "valid non-empty URL potentially surrounded by spaces".

    >>> strip_html5_whitespace(' hello\n')
    'hello'
    )ry   HTML5_WHITESPACE)r   s    r   strip_html5_whitespacer   E  s     ::&''r   ) Tutf-8r3   )r   N)r   r   N)r   N))
	r   N)r   TN)r   r   )r   r   )scriptnoscript)"__doc__r   rG   r&   r   
w3lib.utilr   r   	w3lib.urlr   rH   rJ   r0   rI   r9   uIrl   rw   re   r~   r   r   r7   r;   r=   r>   rM   rS   rX   rj   rq   r|   r   r   r   r   <module>r      sQ    	 
  + %
"**acecpcp
q
"**("))
4bjjWXZ\Z^Z^_2::ecee  %e  f  hj  hq  hq  tv  tA  tA  hA  B BJJVXZXaXab	  B$8Ct<:.  RZZ 3RYY? -B>H, JM"@($:(r   