Spaces:
Sleeping
Sleeping
File size: 1,682 Bytes
84c66cd b5fafa1 84c66cd f0e5174 b5fafa1 f0e5174 84c66cd f0e5174 84c66cd b5fafa1 f0e5174 0868311 f0e5174 b5fafa1 f0e5174 0868311 f0e5174 b5fafa1 f0e5174 b5fafa1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import pytest
from deepengineer.common_path import DATA_DIR
from deepengineer.webcrawler.pdf_utils import (
convert_ocr_response_to_markdown,
convert_pdf_to_markdown_async,
find_in_markdown,
get_markdown_by_page_numbers,
get_table_of_contents_per_page_markdown,
)
from mistralai import OCRResponse
def load_mock_ocr_response() -> OCRResponse:
    """Load the stored OCR response fixture from ``DATA_DIR``.

    Reads ``report_thermal_neutron.json`` and validates its contents into an
    ``OCRResponse`` via ``OCRResponse.model_validate_json``.

    Returns:
        The ``OCRResponse`` parsed from the JSON fixture.
    """
    # Explicit encoding: without it, open() falls back to the locale's
    # preferred encoding, which varies between platforms.
    with open(DATA_DIR / "report_thermal_neutron.json", encoding="utf-8") as f:
        return OCRResponse.model_validate_json(f.read())
@pytest.mark.expensive
@pytest.mark.asyncio
async def test_convert_pdf_to_markdown_async():
    """Convert the sample PDF with the async helper and check the result.

    Marked ``expensive`` -- presumably because the conversion calls an
    external OCR service; confirm against the project's pytest configuration.
    """
    pdf_path = DATA_DIR / "report_thermal_neutron.pdf"
    assert pdf_path.exists(), f"missing test fixture: {pdf_path}"

    ocr_response = await convert_pdf_to_markdown_async(pdf_path)
    # Check the response type and page count before handing it to the
    # converter, so a wrong return value fails here rather than inside
    # convert_ocr_response_to_markdown.
    assert isinstance(ocr_response, OCRResponse)
    assert len(ocr_response.pages) == 16

    markdown = convert_ocr_response_to_markdown(ocr_response)
    assert "where each cylinder represent" in markdown
def test_table_of_contents_per_page_pdf():
    """The per-page table of contents built from the mock response lists
    ``References - Page 15``."""
    toc = get_table_of_contents_per_page_markdown(load_mock_ocr_response())
    assert "References - Page 15" in toc
def test_find_in_pdf():
    """Searching the mock response for a known phrase returns page ``[7]``."""
    needle = "where each cylinder represent"
    matching_pages = find_in_markdown(load_mock_ocr_response(), needle)
    assert matching_pages == [7]
def test_get_markdown_by_page_numbers():
    """Markdown extracted for pages 7 and 15 of the mock response contains
    both page labels and the expected text fragments."""
    rendered = get_markdown_by_page_numbers(load_mock_ocr_response(), [7, 15])

    # Checked in this order so the first missing fragment is the one reported.
    expected_fragments = (
        "Page 7",
        "Page 15",
        "References",
        "where each cylinder represent",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
|