
    aV                        d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	m
Z
 ddlZddlmZmZmZmZmZmZmZmZmZmZmZmZ ddlmZmZ ddlmZmZmZ d Z  ej!        de            d	Z"d
Z#e"e#z   Z$ej%        ej&        z   dz   '                    d          Z(dZ)e$e(z   e)z   dz   Z* ej+        d          Z,d9dZ- ej+        d          Z.d9dZ/d Z0d:dZ1d;dZ2d Z3d Z4d Z5d Z6d  Z7d! Z8 e9 e:e; e<d"                              Z=d#>                     ej?        d$@                    e= e9 e:e; e<dd%                              z
   e9d&          z
                                ZAd'>                     ej?        d$@                    e=h d(z
                       ej?        d$@                    e=                              ZB ej+        d)>                    eA*          '                                          ZC ej+        d+>                    eAeB,          '                                          ZD e	d-d.          ZEd/ ZFg d0ZGd9d1ZH	 	 d<d2ZId3 ZJd=d4ZKejL        sdd5lMmNZNmOZO d>d6ZPd?d8ZQdS )@zW
This module contains general purpose URL functions not found in the standard
library.
    N)
namedtupleOrderedDict)urljoinurlsplit
urlunsplit	urldefrag	urlencodeurlparsequoteparse_qs	parse_qslParseResultunquote
urlunparse)pathname2urlurl2pathname)to_bytesto_native_str
to_unicodec                 v    t          t          | j        | j        | j                                     | j        fS N)r   r   objectstartend)errors    )lib/python3.11/site-packages/w3lib/url.py_quote_byter      s/    uU\%+ei*?@AABBEINN    percentencodes   :/?#[]@s   !$&'()*+,;=z-._~ascii   |   %z[\t\n\r]utf8c                 L   t          | |d          }t          t                              d|                    }	 |j                            d          }n# t          $ r
 |j        }Y nw xY wt          t          |j	                  t          |          
                    d          t          t          |j        |          t                    t          t          |j        |          t                    t          t          |j        |          t                    f          S )a  Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986. Also, ASCII tabs and newlines are removed
    as per https://url.spec.whatwg.org/#url-parsing.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
    URL path component (unless overriden by path_encoding), and given
    encoding is used for query string or form data.
    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    r   )encodingerrors idna:)r   r   _ascii_tab_newline_resubnetlocencodeUnicodeErrorr   r   schemerstripr   r   path_safe_charsqueryfragment)urlr%   path_encodingdecodedpartsr,   s         r   safe_url_stringr9   '   s   . xHHHG*..r7;;<<E$$V,,   
 el##f$$S)) 	huz=11;?? 	hu{H--{;;hu~x00+>>   s   A A+*A+z
/?(\.\./)+c                 8   t          | ||          }t          |          \  }}}}}|r]t                              dt	          j        |                    }|                    d          r|                    d          s|dz  }nd}t          ||||df          S )z Make a url for download. This will call safe_url_string
    and then strip the fragment, if one exists. The path will
    be normalised.

    If the path is outside the document root, it will be changed
    to be within the document root.
    r'   /)r9   r   _parent_dirsr+   	posixpathnormpathendswithr   )	r5   r%   r6   safe_urlr/   r,   r1   r3   _s	            r   safe_download_urlrB   Z   s     sHm<<H%-h%7%7"FFD% I$6t$<$<==S!! 	$--*<*< 	CKDvvtUB7888r   c                 <    |                      d          d         dv S )Nz://r   )filehttphttps)	partition)texts    r   is_urlrI   m   s    >>%  #'@@@r   c                     t          t          t          |                     d         |          }|                    ||g          d         S )a  Return the value of a url parameter, given the url and parameter name

    General case:

    >>> import w3lib.url
    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "id")
    '200'
    >>>

    Return a default value if the parameter is not found:

    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
    'mydefault'
    >>>

    Returns None if `keep_blank_values` not set or 0 (default):

    >>> w3lib.url.url_query_parameter("product.html?id=", "id")
    >>>

    Returns an empty string if `keep_blank_values` set to 1:

    >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1)
    ''
    >>>

       keep_blank_valuesr   )r   r   strget)r5   	parameterdefaultrM   queryparamss        r   url_query_parameterrS   q   sN    : S1+  K ??9wi0033r    &=FTc                    t          |t          j        t          f          r|g}t	          |           \  } }|                     d          \  }}	}
t                      }g }|
                    |          D ]]}|s|                    |          \  }}	}	|r||v r%|r||v r,|s||vr3|                    |           |	                    |           ^|r*d
                    ||
                    |          g          n|} |r| d|z   z  } | S )a  Clean URL arguments leaving only those passed in the parameterlist keeping order

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'

    ?#)
isinstancesix	text_typebytesr   rG   setsplitappendaddjoin)r5   parameterlistsepkvsepremoveuniquekeep_fragmentsr4   baserA   r3   seen	querylistksvks                  r   url_query_cleanerrn      s?   @ -#-!788 (&cNNMC]]3''ND!U55DI{{3   	--&&1a 	a4ii 	]** 	A]22S!!!HHQKKKK3<
F#((D#((9--.
/
/
/$C sX~Jr   c                     t          |           }t          |j        d          }t          |          }|                    |           t          |          }t          |                    |                    S )NTrL   )r3   )r   r   r3   r   updater	   r   _replace)r5   paramsparsedargsnew_argsr3   s         r   _add_or_replace_parametersrv      si    c]]FV\T:::D4  HOOFhEfooEo22333r   c                 &    t          | ||i          S )aN  Add or remove a parameter to a given url

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php', 'arg', 'v')
    'http://www.example.com/index.php?arg=v'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg4', 'v4')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3&arg4=v4'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg3', 'v3new')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new'
    >>>

    rv   )r5   name	new_values      r   add_or_replace_parameterr{      s     &cD)+<===r   c                 "    t          | |          S )a  Add or remove a parameters to a given url

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php', {'arg': 'v'})
    'http://www.example.com/index.php?arg=v'
    >>> args = {'arg4': 'v4', 'arg3': 'v3new'}
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', args)
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new&arg4=v4'
    >>>

    rx   )r5   new_parameterss     r   add_or_replace_parametersr~      s     &c>:::r   c                     t          t          j                            |                     }t          j        dk    r|                    dd          }d|                    d          z  S )zwConvert local filesystem path to legal File URIs as described in:
    http://en.wikipedia.org/wiki/File_URI_scheme
    nt|r)   z
file:///%sr;   )r   osr1   abspathry   replacelstrip)r1   xs     r   path_to_file_urir      sR     	RW__T**++A	w$IIc3!((3--''r   c                 H    t          |           j        }t          |          S )zmConvert File URI to local filesystem path according to:
    http://en.wikipedia.org/wiki/File_URI_scheme
    )r
   r1   r   )uriuri_paths     r   file_uri_to_pathr     s      }}!H!!!r   c                     t           j                            |           d         rt          |           S t	          |           }|j        r| nt          |           S )zRIf given a path name, return its File URI, otherwise return it
    unmodified
    r   )r   r1   
splitdriver   r
   r/   )uri_or_pathus     r   
any_to_urir   
  sV     
w+&&q) -,,,A(E;;(8(E(EEr      z[{}]+r'       z()<>@,;:\"/[]?= z(?:[{}]|(?:\\[{}]))*>   "\z{token}/{token})tokenz%;({token})=(?:({token})|"({quoted})"))r   quotedParseDataURIResultz%media_type media_type_parameters datac                 f   t          | t                    s"t          |                               d          } 	 |                     dd          \  }} n# t
          $ r t          d          w xY w|                                dk    rt          d          t          j        rt          |           } nt          |           } d}i }t                              |           }|rC|                                                                }| |                                d         } nd	|d
<   	 t                               |           }|rv|                                \  }}}|rt%          j        dd|          }|                                ||                                <   | |                                d         } nn	 |                     dd          \  }}	n# t
          $ r t          d          w xY w|r)|dk    rt          d          t)          j        |	          }	t-          |||	          S )zt

    Parse a data: URI, returning a 3-tuple of media type, dictionary of media
    type parameters, and data.

    r       :   zinvalid URIs   dataznot a data URIz
text/plainNzUS-ASCIIcharsetTs   \\(.)z\1   ,zinvalid data URIs   ;base64)rZ   r]   r9   r-   r_   
ValueErrorlowerr[   PY2r   unquote_to_bytes_mediatype_patternmatchgroupdecoder   _mediatype_parameter_patterngroupsrer+   base64	b64decode_ParseDataURIResult)
r   r/   
media_typemedia_type_paramsm	attributevaluevalue_quoted	is_base64datas
             r   parse_data_urir   6  s;    c5!! 3c""))'22(iia(( ( ( ('''(||~~  )*** w $clls##J  %%A 2WWYY%%''
!%%''((m'1)$	(..s33 	-.XXZZ*Iul ?y%>>49LLNNi..001aeegghh-CC	-))D!,,	44 - - -+,,,- &
""/000%%z+<dCCCs   A A- G G4)r{   r~   r   canonicalize_urlr   rI   r   r   rB   r9   rn   rS   urljoin_rfcc                    	 | j                             d          }n# t          $ r
 | j         }Y nw xY wt          | j                  t          |          t          t          | j        |          t                    t          t          | j	        |          t                    t          t          | j
        |          t                    t          t          | j        |          t                    fS )Nr(   )r,   r-   r.   r   r/   r   r   r1   r2   rr   r3   r4   )r8   r%   r6   r,   s       r   _safe_ParseResultr     s    $$V,,    	el##f 	huz=11;??hu|]33[AA 	hu{H--{;;hu~x00+>> s    11c                 4   	 t          t          |           |          \  }}}}}}	n<# t          $ r/}
t          t          |           d          \  }}}}}}	Y d}
~
nd}
~
ww xY wt          j        rt          ||          }nt          ||          }|                                 t          |          }t          |          }t          |t                    pd}|sdn|	}	t          ||                                                    d          ||||	f          S )a  Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent encode paths ; non-ASCII characters are percent-encoded
      using UTF-8 (RFC-3986)
    - percent encode query arguments ; non-ASCII characters are percent-encoded
      using passed `encoding` (UTF-8 by default)
    - normalize all spaces (in query arguments) '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless `keep_blank_values` is True)
    - remove fragments (unless `keep_fragments` is True)

    The url passed can be bytes or unicode, while the url returned is
    always a native str (bytes in Python 2, unicode in Python 3).

    >>> import w3lib.url
    >>>
    >>> # sorting query arguments
    >>> w3lib.url.canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50')
    'http://www.example.com/do?a=50&b=2&b=5&c=3'
    >>>
    >>> # UTF-8 conversion + percent-encoding of non-ASCII characters
    >>> w3lib.url.canonicalize_url(u'http://www.example.com/r\u00e9sum\u00e9')
    'http://www.example.com/r%C3%A9sum%C3%A9'
    >>>

    For more examples, see the tests in `tests/test_url.py`.
    )r%   r#   Nr;   r'   r)   )r   	parse_urlUnicodeEncodeErrorr[   r   r   parse_qsl_to_bytessortr	   _unquotepathr   r2   r   r   r0   )r5   rM   rh   r%   r/   r,   r1   rr   r3   r4   ekeyvalsuqps                r   r   r     sF   F-8IcNNX9/ 9/ 9/5feXX - - -8IcNNV9- 9- 9-5feXXXXXX- w ?E#4552 %U,=>>LLNNNgE t

Ck"")cD'5rrXH v||~~,,S11! " " "s   %( 
A!%AA!c                     dD ]0}|                      d|z   d|                                z             } 1t          j        rt	          |           S t          |           S )N)2f2F3f3F%z%25)r   upperr[   r   r   r   )r1   reserveds     r   r   r     sa    , F F||C(NEHNN4D4D,DEE
w 
&t}}  %%%r   c                 j    t          | t                    r| S t          t          | |                    S )z\Return urlparsed url from the given argument (which could be an already
    parsed url)
    )rZ   r   r
   r   )r5   r%   s     r   r   r   	  s3     #{## 
JsH--...r   )_coerce_argsr   c                 4   t          |           \  } }d |                     d          D             }g }|D ]}|s|                    dd          }t          |          dk    r|r|                    d           nGt          |d                   s|r|d                             dd	          }t          |          } ||          }|d                             dd	          }t          |          } ||          }|                    ||f           |S )
a-  Parse a query given as a string argument.

        Data are returned as a list of name, value pairs as bytes.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.  A
            true value indicates that blanks should be retained as blank
            strings.  The default false value indicates that blank values
            are to be ignored and treated as if they were  not included.

        c                 B    g | ]}|                     d           D ]}|S );)r_   ).0s1s2s      r   
<listcomp>z&parse_qsl_to_bytes.<locals>.<listcomp>*  s/    DDDbhhsmmDDDDDDr   rU   rV   r      r'   r   + )r   r_   lenr`   r   r   )	qsrM   _coerce_resultpairsr
name_valuenvry   r   s	            r   r   r     s.   ( *"--NDDbhhsmmDDD 	( 	(J !!#q))B2ww!||$ IIbMMMM2a5zz (. (!u}}S#..'--%~d++1c3//(//&u--$'''r   utf-8c                     t          j        dt                     t          | |          }t          ||          }t	          ||          S )a1  
    .. warning::

        This function is deprecated and will be removed in future.
        It is not supported with Python 3.
        Please use ``urlparse.urljoin`` instead.

    Same as urlparse.urljoin but supports unicode values in base and ref
    parameters (in which case they will be converted to str using the given
    encoding).

    Always returns a str.

    >>> import w3lib.url
    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
    'http://www.example.com/otherpath/index2.html'
    >>>

    >>> # Note: the following does not work in Python 3
    >>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm') # doctest: +SKIP
    'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
    >>>


    zAw3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead)warningswarnDeprecationWarningr   r   )ri   refr%   str_basestr_refs        r   r   r   A  sN    6 MU   h''HsH%%G8W%%%r   )r#   r#   )Nr   )rT   rU   rV   FTF)TFNr   )F)r   )R__doc__r   codecsr   r   r=   r   stringcollectionsr   r   r[   six.moves.urllib.parser   r   r   r   r	   r
   r   r   r   r   r   r   six.moves.urllib.requestr   r   
w3lib.utilr   r   r   r   register_errorRFC3986_GEN_DELIMSRFC3986_SUB_DELIMSRFC3986_RESERVEDascii_lettersdigitsr-   RFC3986_UNRESERVEDEXTRA_SAFE_CHARSr2   compiler*   r9   r<   rB   rI   rS   rn   rv   r{   r~   r   r   r   r^   mapchrrange_charformatescaperb   _token_quoted_stringr   r   r   r   __all__r   r   r   r   r   urllib.parser   r   r   r   rT   r   r   <module>r      s      				 				       / / / / / / / / 



F F F F F F F F F F F F F F F F F F F F F F F F F F F F @ ? ? ? ? ? ? ? : : : : : : : : : :O O O  o{ 3 3 3   # %(:: *V]:VCKKGTT  !336FFM"
;// . . . .b rz-((9 9 9 9&A A A!4 !4 !4 !4H6 6 6 6r4 4 4> > > ; ; ;( ( (" " "F F F 	CCUU3ZZ  !! 
2775+.3ss3a/E/E+F+F,G ,/3/B+C+C	,D $E $E F F 
G 
G )//BIbgge////0011BIbggenn   RZF++2244    *rz,33&;I 4 6 66<fhh     !j!5!HJ J <D <D <D~  "   . BG"X" X" X" X"v& & &"/ / / / w ,;;;;;;;;) ) ) )X &  &  &  &  &  &r   