File size: 1,682 Bytes
84c66cd
b5fafa1
 
 
 
 
 
 
 
 
 
84c66cd
f0e5174
b5fafa1
f0e5174
 
84c66cd
 
 
 
 
 
f0e5174
 
84c66cd
 
 
b5fafa1
f0e5174
 
 
0868311
f0e5174
 
b5fafa1
f0e5174
 
0868311
f0e5174
 
b5fafa1
f0e5174
 
 
 
 
 
 
b5fafa1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pytest
from deepengineer.common_path import DATA_DIR
from deepengineer.webcrawler.pdf_utils import (
    convert_ocr_response_to_markdown,
    convert_pdf_to_markdown_async,
    find_in_markdown,
    get_markdown_by_page_numbers,
    get_table_of_contents_per_page_markdown,
)
from mistralai import OCRResponse


def load_mock_ocr_response() -> OCRResponse:
    """Load the stored OCR response fixture used by the offline tests.

    Reads ``report_thermal_neutron.json`` from ``DATA_DIR`` and validates it
    into an ``OCRResponse``.

    Returns:
        The ``OCRResponse`` parsed from the JSON fixture.
    """
    # Pin the encoding: without it, open() falls back to the platform's
    # locale encoding, which may not be UTF-8.
    with open(DATA_DIR / "report_thermal_neutron.json", encoding="utf-8") as f:
        return OCRResponse.model_validate_json(f.read())


@pytest.mark.expensive
@pytest.mark.asyncio
async def test_convert_pdf_to_markdown_async():
    """Convert the sample PDF to an OCR response and then to markdown.

    Marked ``expensive`` because it runs the real asynchronous conversion
    rather than loading the stored JSON fixture.
    """
    pdf_path = DATA_DIR / "report_thermal_neutron.pdf"
    assert pdf_path.exists(), f"missing test fixture: {pdf_path}"

    ocr_response = await convert_pdf_to_markdown_async(pdf_path)
    # Validate the response before handing it to the markdown converter, so
    # that a wrong type or page count fails here rather than inside the
    # converter with a less direct error.
    assert isinstance(ocr_response, OCRResponse)
    assert len(ocr_response.pages) == 16

    markdown = convert_ocr_response_to_markdown(ocr_response)
    assert "where each cylinder represent" in markdown


def test_table_of_contents_per_page_pdf():
    """Check the per-page table of contents built from the mock OCR response.

    The generated text must contain the entry ``References - Page 15``.
    """
    toc = get_table_of_contents_per_page_markdown(load_mock_ocr_response())
    assert "References - Page 15" in toc


def test_find_in_pdf():
    """Check that searching the mock OCR response for a phrase yields ``[7]``."""
    needle = "where each cylinder represent"
    matches = find_in_markdown(load_mock_ocr_response(), needle)
    assert matches == [7]


def test_get_markdown_by_page_numbers():
    """Check the markdown extracted for pages 7 and 15 of the mock response.

    The result must contain both page labels, the ``References`` text and the
    phrase that the search test expects on page 7.
    """
    rendered = get_markdown_by_page_numbers(load_mock_ocr_response(), [7, 15])
    expected_fragments = (
        "Page 7",
        "Page 15",
        "References",
        "where each cylinder represent",
    )
    # Checked in the same order as before so the first failure is unchanged.
    for fragment in expected_fragments:
        assert fragment in rendered
    assert "where each cylinder represent" in markdown