import re import warnings import weakref import unittest import pickle import typing from typing import cast, Any, Optional, Mapping from lxml import etree from lxml.html import HtmlElement from pkg_resources import parse_version from parsel import Selector, SelectorList from parsel.selector import ( CannotRemoveElementWithoutRoot, CannotRemoveElementWithoutParent, LXML_SUPPORTS_HUGE_TREE, _NOT_SET, ) class SelectorTestCase(unittest.TestCase): """Selector tests run against the class stored in ``sscls``.""" sscls = Selector def assertIsSelector(self, value: Any) -> None: """Assert that value has exactly the type of an ``sscls`` instance.""" self.assertEqual(type(value), type(self.sscls(text=""))) def assertIsSelectorList(self, value: Any) -> None: """Assert that value has exactly the type of an ``sscls.selectorlist_cls`` instance.""" self.assertEqual(type(value), type(self.sscls.selectorlist_cls())) def test_pickle_selector(self) -> None: """Pickling a selector with protocol 2 must raise TypeError.""" sel = self.sscls(text="

some text

") self.assertRaises( TypeError, lambda s: pickle.dumps(s, protocol=2), sel ) def test_pickle_selector_list(self) -> None: """Pickling a selector list with protocol 2 must raise TypeError.""" sel = self.sscls( text="

" ) sel_list = sel.css("li") empty_sel_list = sel.css("p") self.assertIsSelectorList(sel_list) self.assertIsSelectorList(empty_sel_list) self.assertRaises( TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list ) self.assertRaises( TypeError, lambda s: pickle.dumps(s, protocol=2), empty_sel_list ) def test_simple_selection(self) -> None: """Simple selector tests: node selection, attribute extraction and XPath function results""" body = "

" sel = self.sscls(text=body) xl = sel.xpath("//input") self.assertEqual(2, len(xl)) for x in xl: self.assertIsSelector(x) self.assertEqual( sel.xpath("//input").extract(), [x.extract() for x in sel.xpath("//input")], ) self.assertEqual( [x.extract() for x in sel.xpath("//input[@name='a']/@name")], ["a"], ) self.assertEqual( [ x.extract() for x in sel.xpath( "number(concat(//input[@name='a']/@value, //input[@name='b']/@value))" ) ], ["12.0"], ) self.assertEqual( sel.xpath("concat('xpath', 'rules')").extract(), ["xpathrules"] ) self.assertEqual( [ x.extract() for x in sel.xpath( "concat(//input[@name='a']/@value, //input[@name='b']/@value)" ) ], ["12"], ) def test_simple_selection_with_variables(self) -> None: """Using XPath variables passed as keyword arguments to xpath()""" body = "

" sel = self.sscls(text=body) self.assertEqual( [ x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1) ], ["a"], ) self.assertEqual( [ x.extract() for x in sel.xpath("//input[@name=$letter]/@value", letter="b") ], ["2"], ) self.assertEqual( sel.xpath( "count(//input[@value=$number or @name=$letter])", number=2, letter="a", ).extract(), ["2.0"], ) # you can also pass booleans self.assertEqual( sel.xpath( "boolean(count(//input)=$cnt)=$test", cnt=2, test=True ).extract(), ["1"], ) self.assertEqual( sel.xpath( "boolean(count(//input)=$cnt)=$test", cnt=4, test=True ).extract(), ["0"], ) self.assertEqual( sel.xpath( "boolean(count(//input)=$cnt)=$test", cnt=4, test=False ).extract(), ["1"], ) # for named nodes, you need to use "name()=node_name" self.assertEqual( sel.xpath( "boolean(count(//*[name()=$tag])=$cnt)=$test", tag="input", cnt=2, test=True, ).extract(), ["1"], ) def test_simple_selection_with_variables_escape_friendly(self) -> None: """Using XPath variables with quotes that would need escaping with string formatting""" body = """

I'm mixing single and "double quotes" and I don't care :)

""" sel = self.sscls(text=body) t = 'I say "Yeah!"' # naive string formatting will give something like: # ValueError: XPath error: Invalid predicate in //input[@value="I say "Yeah!""]/@name self.assertRaises( ValueError, sel.xpath, f'//input[@value="{t}"]/@name' ) # with XPath variables, escaping is done for you self.assertEqual( [ x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t) ], ["a"], ) lt = """I'm mixing single and "double quotes" and I don't care :)""" # the following gives you something like # ValueError: XPath error: Invalid predicate in //p[normalize-space()='I'm mixing single and "double quotes" and I don't care :)']//@name self.assertRaises( ValueError, sel.xpath, f"//p[normalize-space()='{lt}']//@name" ) self.assertEqual( [ x.extract() for x in sel.xpath( "//p[normalize-space()=$lng]//@name", lng=lt ) ], ["a"], ) def test_accessing_attributes(self) -> None: """The attrib property exposes element attributes; a SelectorList uses its first element""" body = """

""" sel = self.sscls(text=body) self.assertEqual({"lang": "en", "version": "1.0"}, sel.attrib) self.assertEqual( {"id": "some-list", "class": "list-cls"}, sel.css("ul")[0].attrib ) # for a SelectorList, bring the attributes of first-element only self.assertEqual( {"id": "some-list", "class": "list-cls"}, sel.css("ul").attrib ) self.assertEqual( {"class": "item-cls", "id": "list-item-1"}, sel.css("li").attrib ) self.assertEqual({}, sel.css("body").attrib) self.assertEqual({}, sel.css("non-existing-element").attrib) self.assertEqual( [ {"class": "item-cls", "id": "list-item-1"}, {"class": "item-cls active", "id": "list-item-2"}, {"class": "item-cls", "id": "list-item-3"}, ], [e.attrib for e in sel.css("li")], ) def test_representation_slice(self) -> None: """repr() of each selected attribute selector equals the expected representation""" body = f"

" sel = self.sscls(text=body) representation = ( f"" ) self.assertEqual( [repr(it) for it in sel.xpath("//input/@name")], [representation] ) def test_representation_unicode_query(self) -> None: """repr() of a selector built from a query with a non-ASCII character equals the expected representation""" body = f"

" representation = ( "" ) sel = self.sscls(text=body) self.assertEqual( [repr(it) for it in sel.xpath('//input[@value="\xa9"]/@value')], [representation], ) def test_check_text_argument_type(self) -> None: """Passing bytes as the first positional argument raises a TypeError about the text argument type""" self.assertRaisesRegex( TypeError, "text argument should be of type", self.sscls, b"", ) def test_extract_first(self) -> None: """Test if extract_first() returns first element""" body = '

' sel = self.sscls(text=body) self.assertEqual( sel.xpath("//ul/li/text()").extract_first(), sel.xpath("//ul/li/text()").extract()[0], ) self.assertEqual( sel.xpath('//ul/li[@id="1"]/text()').extract_first(), sel.xpath('//ul/li[@id="1"]/text()').extract()[0], ) self.assertEqual( sel.xpath("//ul/li[2]/text()").extract_first(), sel.xpath("//ul/li/text()").extract()[1], ) self.assertEqual( sel.xpath('/ul/li[@id="doesnt-exist"]/text()').extract_first(), None, ) def test_extract_first_default(self) -> None: """Test if extract_first() returns the given default value when no results are found""" body = '

' sel = self.sscls(text=body) self.assertEqual( sel.xpath("//div/text()").extract_first(default="missing"), "missing", ) def test_selector_get_alias(self) -> None: """Test if get() returns the extracted value on a single Selector""" body = '

' sel = self.sscls(text=body) self.assertEqual( sel.xpath("//ul/li[position()>1]")[0].get(), '

' ) self.assertEqual( sel.xpath("//ul/li[position()>1]/text()")[0].get(), "2" ) def test_selector_getall_alias(self) -> None: """Test if get() returns extracted value on a Selector""" body = '

' sel = self.sscls(text=body) self.assertListEqual( sel.xpath("//ul/li[position()>1]")[0].getall(), ['

'], ) self.assertListEqual( sel.xpath("//ul/li[position()>1]/text()")[0].getall(), ["2"] ) def test_selectorlist_get_alias(self) -> None: """Test if get() returns first element for a selection call""" body = '

' sel = self.sscls(text=body) self.assertEqual(sel.xpath("//ul/li").get(), '

') self.assertEqual(sel.xpath("//ul/li/text()").get(), "1") def test_re_first(self) -> None: """Test if re_first() returns the first match, None when nothing matches, or the given default""" body = '

' sel = self.sscls(text=body) self.assertEqual( sel.xpath("//ul/li/text()").re_first(r"\d"), sel.xpath("//ul/li/text()").re(r"\d")[0], ) self.assertEqual( sel.xpath('//ul/li[@id="1"]/text()').re_first(r"\d"), sel.xpath('//ul/li[@id="1"]/text()').re(r"\d")[0], ) self.assertEqual( sel.xpath("//ul/li[2]/text()").re_first(r"\d"), sel.xpath("//ul/li/text()").re(r"\d")[1], ) self.assertEqual(sel.xpath("/ul/li/text()").re_first(r"\w+"), None) self.assertEqual( sel.xpath('/ul/li[@id="doesnt-exist"]/text()').re_first(r"\d"), None, ) self.assertEqual(sel.re_first(r'id="(\d+)'), "1") self.assertEqual(sel.re_first(r"foo"), None) self.assertEqual(sel.re_first(r"foo", default="bar"), "bar") def test_extract_first_re_default(self) -> None: """Test if re_first() returns the given default value when no results are found""" body = '

' sel = self.sscls(text=body) self.assertEqual( sel.xpath("//div/text()").re_first(r"\w+", default="missing"), "missing", ) self.assertEqual( sel.xpath("/ul/li/text()").re_first(r"\w+", default="missing"), "missing", ) def test_select_unicode_query(self) -> None: """An XPath query containing a non-ASCII character selects the expected value""" body = "

" sel = self.sscls(text=body) self.assertEqual( sel.xpath('//input[@name="\xa9"]/@value').extract(), ["1"] ) def test_list_elements_type(self) -> None: """Test that items returned by xpath() and css() have the same type as the selector""" text = "

test

" self.assertEqual( type(self.sscls(text=text).xpath("//p")[0]), type(self.sscls(text=text)), ) self.assertEqual( type(self.sscls(text=text).css("p")[0]), type(self.sscls(text=text)), ) def test_boolean_result(self) -> None: """XPath boolean expressions are extracted as the strings 1 (true) and 0 (false)""" body = "

" xs = self.sscls(text=body) self.assertEqual( xs.xpath("//input[@name='a']/@name='a'").extract(), ["1"] ) self.assertEqual( xs.xpath("//input[@name='a']/@name='n'").extract(), ["0"] ) def test_differences_parsing_xml_vs_html(self) -> None: """Test that XML and HTML Selectors behave differently""" # some text which is parsed differently by XML and HTML flavors text = '

Hello

' hs = self.sscls(text=text, type="html") self.assertEqual( hs.xpath("//div").extract(), ['

Hello

'], ) xs = self.sscls(text=text, type="xml") self.assertEqual( xs.xpath("//div").extract(), ['

Hello

'], ) def test_error_for_unknown_selector_type(self) -> None: """An unknown type argument raises ValueError""" self.assertRaises(ValueError, self.sscls, text="", type="_na_") def test_text_or_root_is_required(self) -> None: """Constructing a selector without any argument raises ValueError""" self.assertRaisesRegex( ValueError, "Selector needs text, body, or root arguments", self.sscls, ) def test_bool(self) -> None: """A selector is falsy when its extracted value is empty and truthy when it is non-empty""" text = 'false true' hs = self.sscls(text=text, type="html") falsish = hs.xpath("//a/@href")[0] self.assertEqual(falsish.extract(), "") self.assertFalse(falsish) trueish = hs.xpath("//a/@href")[1] self.assertEqual(trueish.extract(), "nonempty") self.assertTrue(trueish) def test_slicing(self) -> None: """Indexing a selector list returns a selector; slicing it returns a selector list""" text = "

" hs = self.sscls(text=text, type="html") self.assertIsSelector(hs.css("p")[2]) self.assertIsSelectorList(hs.css("p")[2:3]) self.assertIsSelectorList(hs.css("p")[:2]) self.assertEqual(hs.css("p")[2:3].extract(), ["

"]) self.assertEqual(hs.css("p")[1:3].extract(), ["

", "

"]) def test_nested_selectors(self) -> None: """Nested selector tests: absolute and relative XPath on a nested selection""" body = """

four
five
six

""" x = self.sscls(text=body) divtwo = x.xpath('//div[@class="two"]') self.assertEqual( divtwo.xpath("//li").extract(), [ "

one

", "

two

", "

four

", "

five

", "

six

", ], ) self.assertEqual( divtwo.xpath("./ul/li").extract(), ["

four

", "

five

", "

six

"], ) self.assertEqual( divtwo.xpath(".//li").extract(), ["

four

", "

five

", "

six

"], ) self.assertEqual(divtwo.xpath("./li").extract(), []) def test_selectorlist_getall_alias(self) -> None: """Nested selector tests using getall() on the resulting selector lists""" body = """

four
five
six

""" x = self.sscls(text=body) divtwo = x.xpath('//div[@class="two"]') self.assertEqual( divtwo.xpath("//li").getall(), [ "

one

", "

two

", "

four

", "

five

", "

six

", ], ) self.assertEqual( divtwo.xpath("./ul/li").getall(), ["

four

", "

five

", "

six

"], ) self.assertEqual( divtwo.xpath(".//li").getall(), ["

four

", "

five

", "

six

"], ) self.assertEqual(divtwo.xpath("./li").getall(), []) def test_mixed_nested_selectors(self) -> None: body = """

notme

text

foo

""" sel = self.sscls(text=body) self.assertEqual( sel.xpath('//div[@id="1"]').css("span::text").extract(), ["me"] ) self.assertEqual( sel.css("#1").xpath("./span/text()").extract(), ["me"] ) def test_dont_strip(self) -> None: """Whitespace in extracted text nodes is kept as-is""" sel = self.sscls(text='

fff: zzz

') self.assertEqual(sel.xpath("//text()").extract(), ["fff: ", "zzz"]) def test_namespaces_simple(self) -> None: """A prefix registered with register_namespace() can be used in XPath queries""" body = """ take this found """ x = self.sscls(text=body, type="xml") x.register_namespace("somens", "http://scrapy.org") self.assertEqual(x.xpath("//somens:a/text()").extract(), ["take this"]) def test_namespaces_adhoc(self) -> None: """A prefix passed through the namespaces argument can be used in that XPath query""" body = """ take this found """ x = self.sscls(text=body, type="xml") self.assertEqual( x.xpath( "//somens:a/text()", namespaces={"somens": "http://scrapy.org"}, ).extract(), ["take this"], ) def test_namespaces_adhoc_variables(self) -> None: """Ad-hoc namespaces can be combined with XPath variables in one call""" body = """ take this found """ x = self.sscls(text=body, type="xml") self.assertEqual( x.xpath( "//somens:a/following-sibling::a[@id=$identifier]/text()", namespaces={"somens": "http://scrapy.org"}, identifier="bar", ).extract(), ["found"], ) def test_namespaces_multiple(self) -> None: """Several registered namespaces can be used together, including in chained xpath() calls""" body = """ hello value iron90Dried Rose """ x = self.sscls(text=body, type="xml") x.register_namespace( "xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05", ) x.register_namespace("p", "http://www.scrapy.org/product") x.register_namespace("b", "http://somens.com") self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) self.assertEqual(x.xpath("//b:Operation/text()").extract()[0], "hello") self.assertEqual( x.xpath("//xmlns:TestTag/@b:att").extract()[0], "value" ) self.assertEqual( x.xpath("//p:SecondTestTag/xmlns:price/text()").extract()[0], "90" ) self.assertEqual( x.xpath("//p:SecondTestTag") .xpath("./xmlns:price/text()")[0] .extract(), "90", ) self.assertEqual( x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], "iron", ) def test_namespaces_multiple_adhoc(self) -> None: """Ad-hoc namespaces apply to a single xpath() call only, while registered ones persist""" body = """ hello value iron90Dried Rose """ x = self.sscls(text=body, type="xml") x.register_namespace( "xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05", ) self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) # "b" namespace is not declared yet self.assertRaises(ValueError, x.xpath, 
"//xmlns:TestTag/@b:att") # "b" namespace being passed ad-hoc self.assertEqual( x.xpath( "//b:Operation/text()", namespaces={"b": "http://somens.com"} ).extract()[0], "hello", ) # "b" namespace declaration is not cached self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att") # "xmlns" is still defined self.assertEqual( x.xpath( "//xmlns:TestTag/@b:att", namespaces={"b": "http://somens.com"}, ).extract()[0], "value", ) # chained selectors still have knowledge of register_namespace() operations self.assertEqual( x.xpath( "//p:SecondTestTag", namespaces={"p": "http://www.scrapy.org/product"}, ) .xpath("./xmlns:price/text()")[0] .extract(), "90", ) # but chained selectors don't know about parent ad-hoc declarations self.assertRaises( ValueError, x.xpath( "//p:SecondTestTag", namespaces={"p": "http://www.scrapy.org/product"}, ).xpath, "p:name/text()", ) # ad-hoc declarations need repeats when chaining self.assertEqual( x.xpath( "//p:SecondTestTag", namespaces={"p": "http://www.scrapy.org/product"}, ) .xpath( "p:name/text()", namespaces={"p": "http://www.scrapy.org/product"}, ) .extract_first(), "Dried Rose", ) # declaring several ad-hoc namespaces self.assertEqual( x.xpath( "string(//b:Operation/following-sibling::xmlns:TestTag" "/following-sibling::*//p:name)", namespaces={ "b": "http://somens.com", "p": "http://www.scrapy.org/product", }, ).extract_first(), "Dried Rose", ) # "p" prefix is not cached from previous calls self.assertRaises( ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()" ) x.register_namespace("p", "http://www.scrapy.org/product") self.assertEqual( x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], "iron", ) def test_make_links_absolute(self) -> None: """make_links_absolute() on the root element resolves links against base_url""" text = 'link to file' sel = Selector(text=text, base_url="http://example.com") typing.cast(HtmlElement, sel.root).make_links_absolute() self.assertEqual( "http://example.com/file.html", sel.xpath("//a/@href").extract_first(), ) def test_re(self) -> None: """re() returns the regular expression matches found in the selected nodes""" body = """

Name: Mary

Name: John
Age: 10
Name: Paul
Age: 20

Age: 20

""" x = self.sscls(text=body) name_re = re.compile(r"Name: (\w+)") self.assertEqual(x.xpath("//ul/li").re(name_re), ["John", "Paul"]) self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"), ["10", "20"]) # Test named group, hit and miss x = self.sscls(text="foobar") self.assertEqual(x.re("(?Pfoo)"), ["foo"]) self.assertEqual(x.re("(?Pbaz)"), []) # A purposely constructed test for an edge case x = self.sscls(text="baz") self.assertEqual(x.re("(?Pfoo)|(?Pbaz)"), []) def test_re_replace_entities(self) -> None: """re() and re_first() honour the replace_entities argument""" body = """""" x = self.sscls(text=body) name_re = re.compile('{"foo":(.*)}') # by default, only & and < are preserved ; # other entities are converted expected = '"bar & "baz""' self.assertEqual(x.xpath("//script/text()").re(name_re), [expected]) self.assertEqual(x.xpath("//script").re(name_re), [expected]) self.assertEqual(x.xpath("//script/text()")[0].re(name_re), [expected]) self.assertEqual(x.xpath("//script")[0].re(name_re), [expected]) # check that re_first() works the same way for single value output self.assertEqual(x.xpath("//script").re_first(name_re), expected) self.assertEqual(x.xpath("//script")[0].re_first(name_re), expected) # switching off replace_entities will preserve " also expected = '"bar & "baz""' self.assertEqual( x.xpath("//script/text()").re(name_re, replace_entities=False), [expected], ) self.assertEqual( x.xpath("//script")[0].re(name_re, replace_entities=False), [expected], ) self.assertEqual( x.xpath("//script/text()").re_first( name_re, replace_entities=False ), expected, ) self.assertEqual( x.xpath("//script")[0].re_first(name_re, replace_entities=False), expected, ) def test_re_intl(self) -> None: """re() matches text that contains a non-ASCII character""" body = "

Evento: cumplea\xf1os

" x = self.sscls(text=body) self.assertEqual( x.xpath("//div").re(r"Evento: (\w+)"), ["cumplea\xf1os"] ) def test_selector_over_text(self) -> None: """Selectors can be built from plain text with both the default and the xml type""" hs = self.sscls(text="lala") self.assertEqual( hs.extract(), "lala" ) xs = self.sscls(text="lala", type="xml") self.assertEqual(xs.extract(), "lala") self.assertEqual(xs.xpath(".").extract(), ["lala"]) def test_invalid_xpath(self) -> None: """Test invalid xpath raises ValueError whose message contains the invalid xpath""" x = self.sscls(text="") xpath = "//test[@foo='bar]" self.assertRaisesRegex(ValueError, re.escape(xpath), x.xpath, xpath) def test_invalid_xpath_unicode(self) -> None: """Test *Unicode* invalid xpath raises ValueError whose message contains the invalid xpath""" x = self.sscls(text="") xpath = "//test[@foo='\\u0431ar]" self.assertRaisesRegex(ValueError, re.escape(xpath), x.xpath, xpath) def test_http_header_encoding_precedence(self) -> None: # '\xa3' = pound symbol in unicode # '\xc2\xa3' = pound symbol in utf-8 # '\xa3' = pound symbol in latin-1 (iso-8859-1) text = """ \xa3""" x = self.sscls(text=text) self.assertEqual( x.xpath("//span[@id='blank']/text()").extract(), ["\xa3"] ) def test_empty_bodies_shouldnt_raise_errors(self) -> None: """Querying a selector built from empty text must not raise""" self.sscls(text="").xpath("//text()").extract() def test_bodies_with_comments_only(self) -> None: sel = self.sscls( text="", base_url="http://example.com" ) self.assertEqual("http://example.com", sel.root.base) def test_null_bytes_shouldnt_raise_errors(self) -> None: """Text containing a null byte can be parsed and queried without raising""" text = "pre\x00post" self.sscls(text).xpath("//text()").extract() def test_replacement_char_from_badly_encoded_body(self) -> None: # \xe9 alone isn't valid utf8 sequence text = "

an Jos\\ufffd de

" self.assertEqual( ["an Jos\\ufffd de"], self.sscls(text).xpath("//text()").extract() ) def test_select_on_unevaluable_nodes(self) -> None: """Nested XPath queries on text and attribute selections return an empty list""" r = self.sscls(text='some text') # Text node x1 = r.xpath("//text()") self.assertEqual(x1.extract(), ["some text"]) self.assertEqual(x1.xpath(".//b").extract(), []) # Tag attribute x1 = r.xpath("//span/@class") self.assertEqual(x1.extract(), ["big"]) self.assertEqual(x1.xpath(".//text()").extract(), []) def test_select_on_text_nodes(self) -> None: """Text-node axes work when evaluated inside a single XPath expression""" r = self.sscls( text="

Options:opt1

Otheropt2

" ) x1 = r.xpath( "//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]" ) self.assertEqual(x1.extract(), ["opt1"]) x1 = r.xpath( "//div/descendant::text()/preceding-sibling::b[contains(text(), 'Options')]" ) self.assertEqual(x1.extract(), ["Options:"]) @unittest.skip("Text nodes lost parent node reference in lxml") def test_nested_select_on_text_nodes(self) -> None: """Skipped test: nested XPath from text-node selections to their preceding siblings""" # FIXME: does not work with lxml backend [upstream] r = self.sscls( text="

Options:opt1

Otheropt2

" ) x1 = r.xpath("//div/descendant::text()") x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]") self.assertEqual(x2.extract(), ["Options:"]) def test_weakref_slots(self) -> None: """Check that classes are using slots and are weak-referenceable""" x = self.sscls(text="") weakref.ref(x) assert not hasattr( x, "__dict__" ), f"{x.__class__.__name__} does not use __slots__" def test_remove_namespaces(self) -> None: """After remove_namespaces(), elements match unprefixed XPath queries""" xml = """ """ sel = self.sscls(text=xml, type="xml") self.assertEqual(len(sel.xpath("//link")), 0) self.assertEqual(len(sel.xpath("./namespace::*")), 3) sel.remove_namespaces() self.assertEqual(len(sel.xpath("//link")), 3) self.assertEqual(len(sel.xpath("./namespace::*")), 1) def test_remove_namespaces_embedded(self) -> None: """Elements reachable through ad-hoc prefixes match unprefixed queries after remove_namespaces()""" xml = """ """ sel = self.sscls(text=xml, type="xml") self.assertEqual(len(sel.xpath("//link")), 0) self.assertEqual(len(sel.xpath("//stop")), 0) self.assertEqual(len(sel.xpath("./namespace::*")), 2) self.assertEqual( len( sel.xpath( "//f:link", namespaces={"f": "http://www.w3.org/2005/Atom"}, ) ), 2, ) self.assertEqual( len( sel.xpath( "//s:stop", namespaces={"s": "http://www.w3.org/2000/svg"} ) ), 2, ) sel.remove_namespaces() self.assertEqual(len(sel.xpath("//link")), 2) self.assertEqual(len(sel.xpath("//stop")), 2) self.assertEqual(len(sel.xpath("./namespace::*")), 1) def test_remove_attributes_namespaces(self) -> None: """After remove_namespaces(), attributes match unprefixed XPath queries""" xml = """ """ sel = self.sscls(text=xml, type="xml") self.assertEqual(len(sel.xpath("//link/@type")), 0) sel.remove_namespaces() self.assertEqual(len(sel.xpath("//link/@type")), 3) def test_smart_strings(self) -> None: """Lxml smart strings return values""" class SmartStringsSelector(Selector): _lxml_smart_strings = True body = """

four
five
six

""" # .getparent() is available for text nodes and attributes # only when smart_strings are on x = self.sscls(text=body) li_text = x.xpath("//li/text()") self.assertFalse(any([hasattr(e.root, "getparent") for e in li_text])) div_class = x.xpath("//div/@class") self.assertFalse( any([hasattr(e.root, "getparent") for e in div_class]) ) smart_x = SmartStringsSelector(text=body) smart_li_text = smart_x.xpath("//li/text()") self.assertTrue( all([hasattr(e.root, "getparent") for e in smart_li_text]) ) smart_div_class = smart_x.xpath("//div/@class") self.assertTrue( all([hasattr(e.root, "getparent") for e in smart_div_class]) ) def test_xml_entity_expansion(self) -> None: malicious_xml = ( '' " ]>&xxe;' ) sel = self.sscls(text=malicious_xml, type="xml") self.assertEqual(sel.extract(), "&xxe;") def test_configure_base_url(self) -> None: """The base_url argument is exposed as root.base""" sel = self.sscls(text="nothing", base_url="http://example.com") self.assertEqual("http://example.com", sel.root.base) def test_extending_selector(self) -> None: """Selector subclasses with a custom selectorlist_cls keep their types through xpath() and css()""" class MySelectorList(SelectorList["MySelector"]): pass class MySelector(Selector): selectorlist_cls = MySelectorList def extra_method(self) -> str: return "extra" + cast(str, self.get()) sel = MySelector(text="

foo

") self.assertIsInstance(sel.xpath("//div"), MySelectorList) self.assertIsInstance(sel.xpath("//div")[0], MySelector) self.assertIsInstance(sel.css("div"), MySelectorList) self.assertIsInstance(sel.css("div")[0], MySelector) content: str = sel.css("div")[0].extra_method() self.assertEqual("extra

foo

", content) def test_replacement_null_char_from_body(self) -> None: """Extract a selector built from text that starts with a null character and compare it with the expected markup""" text = "\x00

Grainy

" self.assertEqual( "

Grainy

", self.sscls(text).extract(), ) def test_remove_selector_list(self) -> None: """drop() on a selector list removes all of its elements from the tree""" sel = self.sscls( text="

" ) sel_list = sel.css("li") sel_list.drop() self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li"), []) def test_remove_selector(self) -> None: """drop() on a single selector removes only that element""" sel = self.sscls( text="

" ) sel_list = sel.css("li") sel_list[0].drop() self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["2", "3"]) def test_remove_pseudo_element_selector_list(self) -> None: """drop() on a list of ::text selections raises CannotRemoveElementWithoutRoot and leaves the tree unchanged""" sel = self.sscls( text="

" ) sel_list = sel.css("li::text") self.assertEqual(sel_list.getall(), ["1", "2", "3"]) with self.assertRaises(CannotRemoveElementWithoutRoot): sel_list.drop() self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["1", "2", "3"]) def test_remove_pseudo_element_selector(self) -> None: """drop() on a single ::text selection raises CannotRemoveElementWithoutRoot and leaves the tree unchanged""" sel = self.sscls( text="

" ) sel_list = sel.css("li::text") self.assertEqual(sel_list.getall(), ["1", "2", "3"]) with self.assertRaises(CannotRemoveElementWithoutRoot): sel_list[0].drop() self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["1", "2", "3"]) def test_remove_root_element_selector(self) -> None: """drop() on the selector itself or on html raises CannotRemoveElementWithoutParent; body can be dropped""" sel = self.sscls( text="

" ) sel_list = sel.css("li::text") self.assertEqual(sel_list.getall(), ["1", "2", "3"]) with self.assertRaises(CannotRemoveElementWithoutParent): sel.drop() with self.assertRaises(CannotRemoveElementWithoutParent): sel.css("html").drop() self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["1", "2", "3"]) sel.css("body").drop() self.assertEqual(sel.get(), "") def test_deep_nesting(self) -> None: """Deeply nested elements are only found when huge tree support is available and enabled""" lxml_version = parse_version(etree.__version__) lxml_huge_tree_version = parse_version("4.2") content = """ hello world

some test

""" # If lxml doesn't support huge trees expect wrong results and a warning if lxml_version < lxml_huge_tree_version: with warnings.catch_warnings(record=True) as w: sel = Selector(text=content) self.assertIn("huge_tree", str(w[0].message)) self.assertLessEqual(len(sel.css("span")), 256) self.assertEqual(len(sel.css("td")), 0) return # Same goes for explicitly disabling huge trees with warnings.catch_warnings(record=True) as w: sel = Selector(text=content, huge_tree=False) self.assertIn("huge_tree", str(w[0].message)) self.assertLessEqual(len(sel.css("span")), 256) self.assertEqual(len(sel.css("td")), 0) # If huge trees are enabled, elements with a depth > 255 should be found sel = Selector(text=content) nest_level = 282 self.assertEqual(len(sel.css("span")), nest_level) self.assertEqual(len(sel.css("td")), 1) def test_invalid_type(self) -> None: """An unsupported type value raises ValueError""" with self.assertRaises(ValueError): self.sscls("", type="xhtml") def test_default_type(self) -> None: """The selector type defaults to html when only text is given""" text = "foo" selector = self.sscls(text) self.assertEqual(selector.type, "html") def test_json_type(self) -> None: """With type json, the text is parsed and the parsed value becomes the root""" obj = 1 selector = self.sscls(str(obj), type="json") self.assertEqual(selector.root, obj) self.assertEqual(selector.type, "json") def test_html_root(self) -> None: root = etree.fromstring("") selector = self.sscls(root=root) self.assertEqual(selector.root, root) self.assertEqual(selector.type, "html") def test_json_root(self) -> None: """A non-element root object yields a selector of type json""" obj = 1 selector = self.sscls(root=obj) self.assertEqual(selector.root, obj) self.assertEqual(selector.type, "json") def test_json_xpath(self) -> None: """xpath() on a selector with a JSON root raises ValueError""" obj = 1 selector = self.sscls(root=obj) with self.assertRaises(ValueError): selector.xpath("//*") def test_json_css(self) -> None: """css() on a selector with a JSON root raises ValueError""" obj = 1 selector = self.sscls(root=obj) with self.assertRaises(ValueError): selector.css("*") def test_invalid_json(self) -> None: """Empty text with type json gives a root of None""" text = "" selector = self.sscls(text, type="json") self.assertEqual(selector.root, None) self.assertEqual(selector.type, "json") def test_text_and_root_warning(self) -> 
None: """Passing both text and root emits a warning that mentions both arguments""" with warnings.catch_warnings(record=True) as w: Selector(text="a", root="b") self.assertIn("both text and root", str(w[0].message)) def test_etree_root_invalid_type(self) -> None: """An lxml root combined with type text or json raises ValueError""" selector = Selector("") self.assertRaisesRegex( ValueError, "object as root", Selector, root=selector.root, type="text", ) self.assertRaisesRegex( ValueError, "object as root", Selector, root=selector.root, type="json", ) class ExsltTestCase(unittest.TestCase): """EXSLT extension tests run against the class stored in ``sscls``.""" sscls = Selector def test_regexp(self) -> None: """EXSLT regular expression tests (re:test, re:match, re:replace)""" body = """

first link second link EXSLT match example

""" sel = self.sscls(text=body) # re:test() self.assertEqual( sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]').extract(), [ x.extract() for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]') ], ) self.assertEqual( [ x.extract() for x in sel.xpath(r'//a[re:test(@href, "\.html$")]/text()') ], ["first link", "second link"], ) self.assertEqual( [ x.extract() for x in sel.xpath('//a[re:test(@href, "first")]/text()') ], ["first link"], ) self.assertEqual( [ x.extract() for x in sel.xpath('//a[re:test(@href, "second")]/text()') ], ["second link"], ) # re:match() is rather special: it returns a node-set of nodes # ['http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml', # 'http', # 'www.bayes.co.uk', # '', # '/xml/index.xml?/xml/utils/rechecker.xml'] self.assertEqual( sel.xpath( r're:match(//a[re:test(@href, "\.xml$")]/@href,' r'"(\w+):\/\/([^/:]+)(:\d*)?([^# ]*)")/text()' ).extract(), [ "http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml", "http", "www.bayes.co.uk", "", "/xml/index.xml?/xml/utils/rechecker.xml", ], ) # re:replace() self.assertEqual( sel.xpath( r're:replace(//a[re:test(@href, "\.xml$")]/@href,' r'"(\w+)://(.+)(\.xml)", "","https://\2.html")' ).extract(), [ "https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html" ], ) def test_set(self) -> None: """EXSLT set manipulation tests (set:difference)""" # microdata example from http://schema.org/Event body = """

NBA Eastern Conference First Round Playoff Tickets: Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1) Thu, 04/21/16 8:00 p.m.

Wells Fargo Center

Philadelphia, PA

Priced from: $35 1938 tickets left

""" sel = self.sscls(text=body) self.assertEqual( sel.xpath( """//div[@itemtype="http://schema.org/Event"] //@itemprop""" ).extract(), [ "url", "name", "startDate", "location", "url", "address", "addressLocality", "addressRegion", "offers", "lowPrice", "offerCount", ], ) self.assertEqual( sel.xpath( """ set:difference(//div[@itemtype="http://schema.org/Event"] //@itemprop, //div[@itemtype="http://schema.org/Event"] //*[@itemscope]/*/@itemprop)""" ).extract(), ["url", "name", "startDate", "location", "offers"], ) def test_dont_remove_text_after_deleted_element(self) -> None: """Dropping an element keeps the text that follows it""" sel = self.sscls( text="""Text before.Text in. Text after. """ ) sel.css("span").drop() self.assertEqual( sel.get(), "Text before. Text after." ) def test_drop_with_xml_type(self) -> None: """drop() works on an element of a selector created with type xml""" sel = self.sscls(text="", type="xml") el = sel.xpath("//b")[0] assert el.root.getparent() is not None el.drop() assert sel.get() == "" class SelectorBytesInput(Selector): """Selector subclass that encodes a non-empty text argument to bytes and passes it on as body.""" def __init__( self, text: Optional[str] = None, type: Optional[str] = None, body: bytes = b"", encoding: str = "utf8", namespaces: Optional[Mapping[str, str]] = None, root: Optional[Any] = _NOT_SET, base_url: Optional[str] = None, _expr: Optional[str] = None, huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, ) -> None: if text: body = bytes(text, encoding=encoding) text = None super().__init__( text=text, type=type, body=body, encoding=encoding, namespaces=namespaces, root=root, base_url=base_url, _expr=_expr, huge_tree=huge_tree, ) class SelectorTestCaseBytes(SelectorTestCase): """Runs the SelectorTestCase tests with SelectorBytesInput; three tests are overridden to do nothing.""" sscls = SelectorBytesInput def test_representation_slice(self) -> None: pass def test_representation_unicode_query(self) -> None: pass def test_weakref_slots(self) -> None: pass def test_check_text_argument_type(self) -> None: """Passing a str as body raises a TypeError about the body argument type""" self.assertRaisesRegex( TypeError, "body argument should be of type", self.sscls, body="", ) class ExsltTestCaseBytes(ExsltTestCase): """Runs the ExsltTestCase tests with SelectorBytesInput.""" sscls = SelectorBytesInput