Spaces:
Sleeping
Sleeping
File size: 4,543 Bytes
8b32433 151c2dd 8b32433 151c2dd 8b32433 151c2dd 8b32433 151c2dd 8b32433 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import unittest
from bs4 import BeautifulSoup
import beautiful_soup
class BeautifulSoupTest(unittest.TestCase):
    """Tests for the HTML helper functions exposed by the ``beautiful_soup`` module.

    Every test parses an HTML snippet with BeautifulSoup's ``html.parser``
    backend and passes the resulting tree (or a node of it) to a helper.
    The fixture strings are kept flush-left on purpose so that their exact
    content, including whitespace, does not depend on the code indentation.
    """

    def setUp(self):
        """Store a full sample page (with a ``<main>`` element) in ``self.html``."""
        self.html = '''
<html>
<head></head>
<body>
<main>
<div>
<ul>
<li><a href="https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans">Electronic Billing</a></li>
<li><a href="https://www.cms.gov/Medicare/Billing/BillingFAQs">Billing FAQs</a></li>
</ul>
</div>
<div>
<div>
<p>Paragraph</p>
<ul>
<li>List Item</li>
</ul>
Text within div
</div>
</div>
</main>
</body>
</html>
'''

    def test_main_tag(self):
        """``get_main`` must return a tag named ``main`` for a full page and for empty input."""
        soup = BeautifulSoup(self.html, 'html.parser')
        self.assertEqual(beautiful_soup.get_main(soup).name, 'main')

        # An empty document has no <main>; the helper is still expected to
        # hand back a tag named 'main'.
        soup = BeautifulSoup("", 'html.parser')
        self.assertEqual(beautiful_soup.get_main(soup).name, 'main')

    def test_has_no_div_childre(self):
        """Parse the fixtures intended for ``has_no_div_children``.

        NOTE: all assertions below are currently disabled, so this test only
        verifies that the fixtures parse without raising. The method name
        keeps its original spelling so existing test IDs stay stable.
        """
        childless = '''
<html>
<body>
<div><p>Text in div.</p></div>
</body>
</html>
'''
        soup = BeautifulSoup(childless, 'html.parser')
        # TODO: re-enable once has_no_div_children is available/confirmed.
        # self.assertFalse( beautiful_soup.has_no_div_children( soup.body ) )
        # self.assertTrue( beautiful_soup.has_no_div_children( soup.body.div ) )

        nested_div = '''
<html>
<body>
<div>
<div>Text in paragraph.</div>
</div>
</body>
</html>
'''
        soup = BeautifulSoup(nested_div, 'html.parser')
        # TODO: re-enable once has_no_div_children is available/confirmed.
        # self.assertFalse( beautiful_soup.has_no_div_children( soup.body.div ) )

    def test_get_deepest_divs(self):
        """The first div returned by ``get_deepest_divs`` carries the innermost text."""
        nested_div = '''
<html>
<body>
<div>
<div><p>Text in paragraph.</p></div>
</div>
</body>
</html>
'''
        soup = BeautifulSoup(nested_div, 'html.parser')
        deepest = beautiful_soup.get_deepest_divs(soup.body)
        self.assertEqual(deepest[0].text, 'Text in paragraph.')

    def test_list(self):
        """Run ``get_deepest_divs`` on a div that wraps a list with plain and linked items.

        NOTE: the assertion on ``get_list_text`` is currently disabled, so
        this test only checks that the call does not raise.
        """
        # The anchors use a well-formed empty href (the fixture previously
        # read `href""`, which is not a valid attribute).
        nested_div = '''
<html>
<body>
<div>
<ul>
<li>Text in list.</li>
<li><a href="">Link in list.</a></li>
<li>Text with <a href="">Link</a> in list.</li>
</ul>
</div>
</body>
</html>
'''
        soup = BeautifulSoup(nested_div, 'html.parser')
        divs = beautiful_soup.get_deepest_divs(soup.body)
        # TODO: re-enable once get_list_text is available/confirmed.
        # self.assertEqual( beautiful_soup.get_list_text( divs )[0], 'Text in list.' )

    def test_exlcude_links(self):
        """``find_direct_text`` used as a ``find_all`` filter selects the expected elements.

        Of the eight ``<li>``/``<p>`` elements in the fixture, only the
        ``<li>`` whose sole content is a link is expected to be left out.
        The method name keeps its original spelling so existing test IDs
        stay stable.
        """
        nested_div = '''
<li><a href='somelink'>I DONT WANT THIS</a></li>
<li>blablalba <a href='both'>I WANT THIS</a> blalba</li>
<li><a href='right'>I WANT THIS</a> blalba</li>
<li>blablalba <a href='left'>I WANT THIS</a></li>
<p><a href='somelink'>I WANT THIS</a></p>
<p>blablalba <a href='both'>I WANT THIS</a> blalba</p>
<p><a href='right'>I WANT THIS</a> blalba</p>
<p>blablalba <a href='left'>I WANT THIS</a></p>
'''
        soup = BeautifulSoup(nested_div, 'html.parser')
        list_items = soup.find_all(beautiful_soup.find_direct_text)
        expected = [
            'blablalba I WANT THIS blalba',
            'I WANT THIS blalba',
            'blablalba I WANT THIS',
            'I WANT THIS',
            'blablalba I WANT THIS blalba',
            'I WANT THIS blalba',
            'blablalba I WANT THIS',
        ]
        # Compare the whole list at once: an index-based loop would pass
        # silently when fewer elements match and raise IndexError (rather
        # than fail cleanly) when more elements match.
        actual = [item.get_text() for item in list_items]
        self.assertEqual(actual, expected)
# Allow running this test module directly as a script; importing the module
# (e.g. through a test runner) does not trigger the run.
if __name__ == '__main__':
    unittest.main()
|