
    n9dG:                         d Z dZdgZddlmZ ddlZddlZddlmZm	Z	m
Z
mZmZ ddlmZmZ ddlmZmZmZmZmZ d	Z G d
 dee          Z G d de          ZdS )zCUse the HTMLParser library to parse HTML files that aren't too bad.MITHTMLParserTreeBuilder    )
HTMLParserN)CDataCommentDeclarationDoctypeProcessingInstruction)EntitySubstitutionUnicodeDammit)DetectsXMLParsedAsHTMLParserRejectedMarkupHTMLHTMLTreeBuilderSTRICTzhtml.parserc                   f    e Zd ZdZdZdZd Zd Zd ZddZ	dd	Z
d
 Zd Zd Zd Zd Zd Zd ZdS )BeautifulSoupHTMLParserzA subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.
    ignorereplacec                     |                     d| j                  | _        t          j        | g|R i | g | _        |                                  dS )a  Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.           
        on_duplicate_attributeN)popREPLACEr   r   __init__already_closed_empty_element_initialize_xml_detector)selfargskwargss      7lib/python3.11/site-packages/bs4/builder/_htmlparser.pyr   z BeautifulSoupHTMLParser.__init__.   sf     '-jj$dl'
 '
# 	D242226222 -/)%%'''''    c                      t          |          )N)r   )r   messages     r    errorzBeautifulSoupHTMLParser.errorJ   s     #7+++r!   c                 `    |                      ||d          }|                     |           dS )zHandle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: Dictionary of the tag's attributes.
        F)handle_empty_elementN)handle_starttaghandle_endtag)r   nameattrstags       r    handle_startendtagz*BeautifulSoupHTMLParser.handle_startendtagZ   s8     ""4U"KK4     r!   Tc                    i }|D ]F\  }}|d}||v r2| j         }|| j        k    rn$|d| j        fv r|||<   n ||||           n|||<   d}G|                                 \  }	}
| j                            |dd||	|
          }|r:|j        r3|r1|                     |d           | j        	                    |           | j
        |                     |           dS dS )a3  Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: Dictionary of the tag's attributes.
        :param handle_empty_element: True if this tag is known to be
            an empty-element tag (i.e. there is not expected to be any
            closing tag).
        N z"")
sourceline	sourceposF)check_already_closed)r   IGNOREr   getpossoupr'   is_empty_elementr(   r   append	_root_tag_root_tag_encountered)r   r)   r*   r&   	attr_dictkeyvalueon_dupe	attrvaluer/   r0   r+   s               r    r'   z'BeautifulSoupHTMLParser.handle_starttagi   sG    	 	 	JC }i 5dk))t| 444%*IcNNGIsE2222!&	#II $
Ii''$iJ ( 
 
  	;3' 	;,@ 	; t%@@@ -44T:::>!&&t,,,,, "!r!   c                     |r%|| j         v r| j                             |           dS | j                            |           dS )zHandle a closing tag, e.g. '</tag>'
        
        :param name: A tag name.
        :param check_already_closed: True if this tag is expected to
           be the closing portion of an empty-element tag,
           e.g. '<tag></tag>'.
        N)r   remover4   r(   )r   r)   r1   s      r    r(   z%BeautifulSoupHTMLParser.handle_endtag   sT       	*DD,M$M$M
 -44T:::::I##D)))))r!   c                 :    | j                             |           dS )z4Handle some textual data that shows up between tags.N)r4   handle_datar   datas     r    rA   z#BeautifulSoupHTMLParser.handle_data   s    	d#####r!   c                 J   |                     d          r$t          |                    d          d          }nH|                     d          r$t          |                    d          d          }nt          |          }d}|dk     rO| j        j        dfD ]@}|s	 t          |g                              |          }*# t          $ r
}Y d}~9d}~ww xY w|s/	 t          |          }n# t          t          f$ r
}Y d}~nd}~ww xY w|pd}|                     |           dS )zHandle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal.
        x   XN   zwindows-1252u   �)
startswithintlstripr4   original_encoding	bytearraydecodeUnicodeDecodeErrorchr
ValueErrorOverflowErrorrA   )r   r)   	real_namerC   encodinges         r    handle_charrefz&BeautifulSoupHTMLParser.handle_charref   sa    ??3 	"DKK,,b11II__S!! 	"DKK,,b11IID		Is?? "Y8.I   $i[1188BBDD)   DDDD 	9~~.   22s$   #C
CCC, ,DDc                     t           j                            |          }||}nd|z  }|                     |           dS )zHandle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        Nz&%s)r   HTML_ENTITY_TO_CHARACTERgetrA   )r   r)   	characterrC   s       r    handle_entityrefz(BeautifulSoupHTMLParser.handle_entityref   sL     '?CCDII	 DD 4<Dr!   c                     | j                                          | j                             |           | j                             t                     dS )zOHandle an HTML comment.

        :param data: The text of the comment.
        N)r4   endDatarA   r   rB   s     r    handle_commentz&BeautifulSoupHTMLParser.handle_comment   sJ    
 			d###	'"""""r!   c                     | j                                          |t          d          d         }| j                             |           | j                             t                     dS )zYHandle a DOCTYPE declaration.

        :param data: The text of the declaration.
        zDOCTYPE N)r4   r]   lenrA   r	   rB   s     r    handle_declz#BeautifulSoupHTMLParser.handle_decl   s`    
 		C
OO$$%	d###	'"""""r!   c                 :   |                                                     d          rt          }|t          d          d         }nt          }| j                                         | j                            |           | j                            |           dS )z{Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        zCDATA[N)upperrI   r   r`   r   r4   r]   rA   )r   rC   clss      r    unknown_declz$BeautifulSoupHTMLParser.unknown_decl  s    
 ::<<""8,, 	CH'DDC		d###	#r!   c                     | j                                          | j                             |           |                     |           | j                             t                     dS )z\Handle a processing instruction.

        :param data: The text of the instruction.
        N)r4   r]   rA   _document_might_be_xmlr
   rB   s     r    	handle_piz!BeautifulSoupHTMLParser.handle_pi  s_    
 			d#####D)))	/00000r!   N)T)__name__
__module____qualname____doc__r2   r   r   r$   r,   r'   r(   rA   rV   r[   r^   ra   re   rh    r!   r    r   r   $   s          FG( ( (8, , , ! ! !5- 5- 5- 5-n* * * *$$ $ $& & &P  &# # ## # #  1 1 1 1 1r!   r   c                   P     e Zd ZdZdZdZeZeee	gZ
dZd fd	Z	 	 d	dZd Z xZS )
r   zpA Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    FTNc                     t                      }dD ] }||v r|                    |          }|||<   ! t          t          |           j        di | |pg }|pi }|                    |           d|d<   ||f| _        dS )a  Constructor.

        :param parser_args: Positional arguments to pass into 
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into 
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
        )r   Fconvert_charrefsNrm   )dictr   superr   r   updateparser_args)r   rt   parser_kwargsr   extra_parser_kwargsargr;   	__class__s          r    r   zHTMLParserTreeBuilder.__init__*  s     #ff. 	1 	1Cf}}

3+0#C(3#T**3==f===!'R%+0111,1()'7r!   c              #      K   t          |t                    r
|dddfV  dS |g}|g}||g}t          |||d|          }|j        |j        |j        |j        fV  dS )a  Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried 
         in turn.
        NFT)known_definite_encodingsuser_encodingsis_htmlexclude_encodings)
isinstancestrr   markuprL   declared_html_encodingcontains_replacement_characters)	r   r   user_specified_encodingdocument_declared_encodingr}   rz   r{   try_encodingsdammits	            r    prepare_markupz$HTMLParserTreeBuilder.prepare_markupC  s      * fc"" 	4u----F %<#<  5502LM%=)/
 
 
 }f6,57 	7 	7 	7 	7 	7r!   c                     | j         \  }}t          |i |}| j        |_        	 |                    |           n!# t          $ r}t          |          d}~ww xY w|                                 g |_        dS )z{Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.
        N)rt   r   r4   feedAssertionErrorr   closer   )r   r   r   r   parserrU   s         r    r   zHTMLParserTreeBuilder.feedt  s     'f($9&99i	*KK 	* 	* 	* 'q)))		*
 	.0+++s   : 
AAA)NN)NNN)ri   rj   rk   rl   is_xml	picklable
HTMLPARSERNAMEr   r   featuresTRACKS_LINE_NUMBERSr   r   r   __classcell__)rx   s   @r    r   r     s          FIDdF#H 8 8 8 8 8 82 >BJN/7 /7 /7 /7b1 1 1 1 1 1 1r!   )rl   __license____all__html.parserr   syswarningsbs4.elementr   r   r   r	   r
   
bs4.dammitr   r   bs4.builderr   r   r   r   r   r   r   r   rm   r!   r    <module>r      se   I I   # " " " " " 



               9 8 8 8 8 8 8 8              
v1 v1 v1 v1 v1j*@ v1 v1 v1rf1 f1 f1 f1 f1O f1 f1 f1 f1 f1r!   