"""This file contains test cases reported by third parties using
fuzzing tools, primarily from Google's oss-fuzz project. Some of these
represent real problems with Beautiful Soup, but many are problems in
libraries that Beautiful Soup depends on, and many of the test cases
represent different ways of triggering the same problem.
Grouping these test cases together makes it easy to see which test
cases represent the same problem, and puts the test cases in close
proximity to code that can trigger the problems.
"""
import os | |
import pytest | |
from bs4 import ( | |
BeautifulSoup, | |
ParserRejectedMarkup, | |
) | |
class TestFuzz(object):
    """Regression tests driven by fuzzer-reported markup files.

    Each test loads a markup file from the ``fuzz`` directory that sits
    next to this module and hands the raw bytes to Beautiful Soup with a
    specific parser.

    NOTE(review): every test method takes a ``filename`` argument, but no
    ``pytest.mark.parametrize`` decorator or fixture supplying it is
    visible in this file -- confirm that one is provided.
    """

    # Test case markup files from fuzzers are given this extension so
    # they can be included in builds.
    TESTCASE_SUFFIX = ".testcase"

    # This class of error has been fixed by catching a less helpful
    # exception from html.parser and raising ParserRejectedMarkup
    # instead.
    def test_rejected_markup(self, filename):
        """Verify that html.parser rejects the markup cleanly.

        :param filename: Name of the test case file, with or without
            ``TESTCASE_SUFFIX``.
        """
        markup = self.__markup(filename)
        with pytest.raises(ParserRejectedMarkup):
            BeautifulSoup(markup, 'html.parser')

    # This class of error has to do with very deeply nested documents
    # which overflow the Python call stack when the tree is converted
    # to a string. This is an issue with Beautiful Soup which was fixed
    # as part of [bug=1471755].
    def test_deeply_nested_document(self, filename):
        """Verify that a deeply nested document can be parsed and encoded.

        :param filename: Name of the test case file, with or without
            ``TESTCASE_SUFFIX``.
        """
        # Parsing the document and encoding it back to a string is
        # sufficient to demonstrate that the overflow problem has
        # been fixed.
        markup = self.__markup(filename)
        BeautifulSoup(markup, 'html.parser').encode()

    # This class of error represents problems with html5lib's parser,
    # not Beautiful Soup. I use
    # https://github.com/html5lib/html5lib-python/issues/568 to notify
    # the html5lib developers of these issues.
    def test_html5lib_parse_errors(self, filename):
        """Parse the markup with html5lib and print the encoded result.

        :param filename: Name of the test case file, with or without
            ``TESTCASE_SUFFIX``.
        """
        markup = self.__markup(filename)
        print(BeautifulSoup(markup, 'html5lib').encode())

    def __markup(self, filename):
        """Return the raw bytes of the named fuzz test case.

        :param filename: Name of a file in the ``fuzz`` directory beside
            this module; ``TESTCASE_SUFFIX`` is appended when the name
            does not already end with it.
        :return: The contents of the file, as bytes.
        """
        if not filename.endswith(self.TESTCASE_SUFFIX):
            filename += self.TESTCASE_SUFFIX
        this_dir = os.path.split(__file__)[0]
        path = os.path.join(this_dir, 'fuzz', filename)
        # Read inside a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(path, 'rb') as markup_file:
            return markup_file.read()