Spaces:
Sleeping
Sleeping
File size: 1,635 Bytes
e040f4f 4fbfc2b e040f4f 4fbfc2b e040f4f b5fafa1 e040f4f b5fafa1 4fbfc2b e040f4f b5fafa1 e040f4f b5fafa1 4fbfc2b ce79b68 b5fafa1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
from deepengineer.webcrawler.crawl_database import DataBase
import pytest
@pytest.mark.expensive
def test_crawl_database_arxiv_pdf():
    """Crawl an arXiv paper through its PDF URL and check the stored markdown.

    After crawling the ``/pdf/`` URL, the test expects a non-None lookup
    result for both the ``/pdf/`` and the ``/abs/`` URL, a non-None markdown
    on the first page, and exactly 20 pages.
    """
    pdf_url = "https://arxiv.org/pdf/2105.00643"
    abs_url = "https://arxiv.org/abs/2105.00643"

    database = DataBase()
    database.crawl_url(pdf_url)

    # Only the PDF URL was crawled, yet both URL forms are expected to resolve.
    assert database.get_markdown_of_url(pdf_url) is not None
    assert database.get_markdown_of_url(abs_url) is not None

    first_page = database.get_markdown_of_url(pdf_url).pages[0]
    assert first_page.markdown is not None

    page_count = len(database.get_markdown_of_url(pdf_url).pages)
    assert page_count == 20
@pytest.mark.expensive
def test_crawl_database_arxiv_link():
    """Crawl an arXiv paper through its abstract URL and check the stored markdown.

    After crawling the ``/abs/`` URL, the test expects a non-None lookup
    result for both the ``/abs/`` and the ``/pdf/`` URL, a non-None markdown
    on the first page, and exactly 20 pages.
    """
    abs_url = "https://arxiv.org/abs/2105.00643"
    pdf_url = "https://arxiv.org/pdf/2105.00643"

    database = DataBase()
    database.crawl_url(abs_url)

    # Only the abstract URL was crawled, yet both URL forms are expected to resolve.
    assert database.get_markdown_of_url(abs_url) is not None
    assert database.get_markdown_of_url(pdf_url) is not None

    first_page = database.get_markdown_of_url(abs_url).pages[0]
    assert first_page.markdown is not None

    page_count = len(database.get_markdown_of_url(abs_url).pages)
    assert page_count == 20
@pytest.mark.expensive
def test_crawl_database_wikipedia_url():
    """Crawl a Wikipedia article and check the stored markdown.

    The test expects a non-None lookup result for the crawled URL, a
    non-None markdown on the first page, and at least 40 pages.
    """
    article_url = "https://en.wikipedia.org/wiki/Deep_learning"

    database = DataBase()
    database.crawl_url(article_url)

    assert database.get_markdown_of_url(article_url) is not None

    first_page = database.get_markdown_of_url(article_url).pages[0]
    assert first_page.markdown is not None

    # Lower bound rather than an exact count, as in the original assertion.
    page_count = len(database.get_markdown_of_url(article_url).pages)
    assert page_count >= 40
|