"""Tests to ensure that the html.parser tree builder generates good
trees."""
from pdb import set_trace
import pickle
import pytest
import warnings
from bs4.builder import (
HTMLParserTreeBuilder,
ParserRejectedMarkup,
XMLParsedAsHTMLWarning,
)
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
from . import SoupTest, HTMLTreeBuilderSmokeTest

class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):

    default_builder = HTMLParserTreeBuilder

    def test_rejected_input(self):
        # Python's html.parser will occasionally reject markup,
        # especially when there is a problem with the initial DOCTYPE
        # declaration. Different versions of Python sound the alarm in
        # different ways, but Beautiful Soup consistently raises
        # errors as ParserRejectedMarkup exceptions.
        bad_markup = [
            # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
            # https://github.com/python/cpython/issues/81928
            b'\n<![\xff\xfe\xfe\xcd\x00',

            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
            # https://github.com/python/cpython/issues/78661
            b'<![n\x00',

            b"<![UNKNOWN[]]>",
        ]
        for markup in bad_markup:
            with pytest.raises(ParserRejectedMarkup):
                soup = self.soup(markup)

    def test_namespaced_system_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_namespaced_public_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_builder_is_pickled(self):
        """Unlike most tree builders, HTMLParserTreeBuilder can be
        pickled, and it will be restored when the tree is unpickled.
        """
        tree = self.soup("<a><b>foo</a>")
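        # Round-trip the tree through pickle; the restored tree should
        # still carry a builder of the same type.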
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        assert isinstance(loaded.builder, type(tree.builder))

    def test_redundant_empty_element_closing_tags(self):
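        # Closing tags for empty (void) elements such as <br> are
        # ignored, and stray closing tags with no opener produce nothing.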
        self.assert_soup('<br></br><br></br><br></br>', "<br/><br/><br/>")
        self.assert_soup('</br></br></br>', "")

    def test_empty_element(self):
        # This verifies that any buffered data present when the parser
        # finishes working is handled.
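        # "&#" looks like the start of a character reference, so
        # html.parser buffers it; the leftover text should still come
        # through unchanged.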
self.assert_soup("foo &# bar", "foo &# bar")

    def test_tracking_line_numbers(self):
        # The html.parser TreeBuilder keeps track of the line number and
        # position of each element.
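        # sourceline is the 1-based line number; sourcepos is the
        # 0-based column offset reported by html.parser for the start tag.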
markup = "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
soup = self.soup(markup)
assert 2 == soup.p.sourceline
assert 3 == soup.p.sourcepos
assert "sourceline" == soup.p.find('sourceline').name
# You can deactivate this behavior.
soup = self.soup(markup, store_line_numbers=False)
assert "sourceline" == soup.p.sourceline.name
assert "sourcepos" == soup.p.sourcepos.name

    def test_on_duplicate_attribute(self):
        # The html.parser tree builder has a variety of ways of
        # handling a tag that contains the same attribute multiple times.
        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'

        # If you don't provide any particular value for
        # on_duplicate_attribute, later values replace earlier values.
        soup = self.soup(markup)
        assert "url3" == soup.a['href']
        assert ["cls"] == soup.a['class']
        assert "id" == soup.a['id']

        # You can also get this behavior explicitly.
        def assert_attribute(on_duplicate_attribute, expected):
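            # Re-parse the markup under the given policy and check which
            # href value survives.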
            soup = self.soup(
                markup, on_duplicate_attribute=on_duplicate_attribute
            )
            assert expected == soup.a['href']

            # Verify that non-duplicate attributes are treated normally.
            assert ["cls"] == soup.a['class']
            assert "id" == soup.a['id']

        assert_attribute(None, "url3")
        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")

        # You can ignore subsequent values in favor of the first.
        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")

        # And you can pass in a callable that does whatever you want.
        def accumulate(attrs, key, value):
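            # Collect every value seen for this attribute into a list
            # instead of keeping only one of them.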
            if not isinstance(attrs[key], list):
                attrs[key] = [attrs[key]]
            attrs[key].append(value)

        assert_attribute(accumulate, ["url1", "url2", "url3"])

    def test_html5_attributes(self):
        # The html.parser TreeBuilder can convert any entity named in
        # the HTML5 spec to a sequence of Unicode characters, and
        # convert those Unicode characters to a (potentially
        # different) named entity on the way out.
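        # Each row gives the input text, the Unicode it should parse to,
        # and the bytes expected when re-encoding with formatter="html".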
        for input_element, output_unicode, output_element in (
            ("⇄", '\u21c4', b'⇄'),
            ('⊧', '\u22a7', b'⊧'),
            ('𝔑', '\U0001d511', b'𝔑'),
            ('≧̸', '\u2267\u0338', b'≧̸'),
            ('¬', '\xac', b'¬'),
            ('⫬', '\u2aec', b'⫬'),
            ('"', '"', b'"'),
            ('∴', '\u2234', b'∴'),
            ('∴', '\u2234', b'∴'),
            ('∴', '\u2234', b'∴'),
            ("fj", 'fj', b'fj'),
            ("⊔", '\u2294', b'⊔'),
            ("⊔︀", '\u2294\ufe00', b'⊔︀'),
            ("'", "'", b"'"),
            ("|", "|", b"|"),
        ):
            markup = '<div>%s</div>' % input_element
            div = self.soup(markup).div

            without_element = div.encode()
            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
            assert without_element == expect

            with_element = div.encode(formatter="html")
            expect = b"<div>%s</div>" % output_element
            assert with_element == expect