deepdraft / tests /webcrawler /test_pdfs_utils.py
Charles Azam
clean: run linting and formatting on repo
b5fafa1
import pytest
from deepengineer.common_path import DATA_DIR
from deepengineer.webcrawler.pdf_utils import (
convert_ocr_response_to_markdown,
convert_pdf_to_markdown_async,
find_in_markdown,
get_markdown_by_page_numbers,
get_table_of_contents_per_page_markdown,
)
from mistralai import OCRResponse
def load_mock_ocr_response() -> OCRResponse:
    """Load the stored OCR response fixture from the data directory.

    Reads ``report_thermal_neutron.json`` under ``DATA_DIR`` and validates its
    JSON content into an ``OCRResponse``.

    Returns:
        The ``OCRResponse`` parsed from the JSON fixture.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which can fail or mis-decode non-ASCII text in the fixture.
    with open(DATA_DIR / "report_thermal_neutron.json", encoding="utf-8") as f:
        return OCRResponse.model_validate_json(f.read())
@pytest.mark.expensive
@pytest.mark.asyncio
async def test_convert_pdf_to_markdown_async():
    """Convert the sample PDF and check the response type, page count and text."""
    source_pdf = DATA_DIR / "report_thermal_neutron.pdf"
    assert source_pdf.exists()

    response = await convert_pdf_to_markdown_async(source_pdf)
    rendered = convert_ocr_response_to_markdown(response)

    assert isinstance(response, OCRResponse)
    page_count = len(response.pages)
    assert page_count == 16
    assert "where each cylinder represent" in rendered
def test_table_of_contents_per_page_pdf():
    """The per-page table of contents built from the fixture lists 'References - Page 15'."""
    toc = get_table_of_contents_per_page_markdown(load_mock_ocr_response())
    expected_entry = "References - Page 15"
    assert expected_entry in toc
def test_find_in_pdf():
    """Searching the fixture for a known phrase yields exactly ``[7]``."""
    needle = "where each cylinder represent"
    mock_response = load_mock_ocr_response()
    hits = find_in_markdown(mock_response, needle)
    assert hits == [7]
def test_get_markdown_by_page_numbers():
    """Markdown requested for pages 7 and 15 contains the expected fragments."""
    mock_response = load_mock_ocr_response()
    rendered = get_markdown_by_page_numbers(mock_response, [7, 15])

    # Checked in the same order as the individual assertions they replace.
    expected_fragments = (
        "Page 7",
        "Page 15",
        "References",
        "where each cylinder represent",
    )
    for fragment in expected_fragments:
        assert fragment in rendered