Charles Azam committed on
Commit
f0e5174
·
1 Parent(s): 84c66cd

feat: add new tools for pdfs along with tests

Browse files
data/report_thermal_neutron.json ADDED
The diff for this file is too large to render. See raw diff
 
src/deepengineer/webcrawler/pdf_tools.py CHANGED
@@ -19,7 +19,7 @@ MAX_SIZE_BYTES = 49 * 1024 * 1024
19
  async def convert_pdf_to_markdown_async(
20
  pdf_path: Path,
21
  with_image_description: bool = False,
22
- ) -> tuple[str, OCRResponse]:
23
 
24
  mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
25
 
@@ -39,92 +39,85 @@ async def convert_pdf_to_markdown_async(
39
  include_image_base64=True,
40
  )
41
  print(f"Processing PDF: {pdf_path.name}")
42
- return (
43
- _get_combined_markdown(
44
- ocr_response=ocr_response, with_image_description=with_image_description
45
- ),
46
- ocr_response,
47
- )
48
 
49
 
50
- def _get_image_description_using_llm(
51
- base_64_str: str, model: str = "mistral/mistral-small-latest"
52
- ) -> str | None:
53
- assert base_64_str.startswith("data:image/jpeg;base64")
54
-
55
- messages = [
56
- {
57
- "role": "user",
58
- "content": [
59
- {"type": "text", "text": "Describe this image in detail:"},
60
- {"type": "image_url", "image_url": {"url": base_64_str}},
61
- ],
62
- }
63
- ]
64
- try:
65
- response = completion(
66
- model=model, # LiteLLM naming convention
67
- messages=messages,
68
- temperature=0.0,
69
- stream=False,
70
- )
71
- output = dict(response)["choices"][0].message.content
72
- except BadRequestError:
73
- output = ""
74
- return output
75
-
76
-
77
- def _replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
78
- """
79
- Replace image placeholders in markdown with base64-encoded images.
80
 
81
- Args:
82
- markdown_str: Markdown text containing image placeholders
83
- images_dict: Dictionary mapping image IDs to base64 strings
 
 
84
 
85
- Returns:
86
- Markdown text with images replaced by base64 data
87
  """
88
- for img_name, base64_str in images_dict.items():
89
- print(f"Processing image: {img_name}")
90
- try:
91
- image_description = _get_image_description_using_llm(base_64_str=base64_str)
92
- except RetryError:
93
- image_description = "Image not found"
94
- formatted_description = f"""> [Image {img_name} Replaced with Description Below]
95
- > {image_description.replace('\n', '\n> ')}
96
- """
97
- markdown_str = markdown_str.replace(
98
- f"![{img_name}]({img_name})", formatted_description
99
- )
100
- return markdown_str
101
-
102
-
103
- def _get_combined_markdown(
104
- ocr_response: OCRResponse, with_image_description: bool
105
- ) -> str:
106
- """
107
- Combine OCR text and images into a single markdown document.
108
 
109
  Args:
110
- ocr_response: Response from OCR processing containing text and images
 
111
 
112
  Returns:
113
- Combined markdown string with embedded images
114
  """
 
 
 
 
 
115
 
116
- markdowns: list[str] = []
117
- # Extract images from page
118
- for page in ocr_response.pages:
119
- # Replace image placeholders with actual images
120
- if with_image_description:
121
- image_data = {}
122
- for img in page.images:
123
- image_data[img.id] = img.image_base64
124
- page_description = _replace_images_in_markdown(page.markdown, image_data)
125
- else:
126
- page_description = page.markdown
127
- markdowns.append(page_description)
128
-
129
- return "\n\n".join(markdowns)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
 
19
  async def convert_pdf_to_markdown_async(
20
  pdf_path: Path,
21
  with_image_description: bool = False,
22
+ ) -> tuple[OCRResponse]:
23
 
24
  mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
25
 
 
39
  include_image_base64=True,
40
  )
41
  print(f"Processing PDF: {pdf_path.name}")
42
+ return ocr_response
 
 
 
 
 
43
 
44
 
45
def convert_ocr_response_to_markdown(
    ocr_response: OCRResponse
) -> str:
    """
    Concatenate the markdown of every OCR page into a single document.

    Args:
        ocr_response (OCRResponse): The OCR response whose pages are joined.

    Returns:
        str: The markdown of all pages, in page order, separated by blank lines.
    """
    return "\n\n".join(page.markdown for page in ocr_response.pages)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
def get_markdown_by_page_numbers(markdown: OCRResponse, page_numbers: list[int]) -> str:
    """
    Get the markdown of the requested pages, each preceded by a page label.

    Args:
        markdown (OCRResponse): The OCR response of the pdf.
        page_numbers (list[int]): Zero-based indices into ``markdown.pages``.
            Pages are returned in the order given.

    Returns:
        str: One ``*Page N*`` section per requested page, separated by blank lines.

    Raises:
        IndexError: If a page number is negative or past the last page.
    """
    pages = markdown.pages
    page_count = len(pages)
    sections: list[str] = []
    for page_number in page_numbers:
        # Reject negatives explicitly: plain indexing would wrap around and
        # return a page from the end under a misleading "Page -1" label.
        if not 0 <= page_number < page_count:
            raise IndexError(
                f"Page {page_number} is out of range: the pdf has {page_count} page(s)."
            )
        sections.append(f"*Page {page_number}*\n{pages[page_number].markdown}")
    return "\n\n".join(sections)
60
 
61
def find_in_pdf(markdown: OCRResponse, search_query: str) -> list[int]:
    """
    Find the page numbers of the pdf that contain the search query.

    The match is a case-insensitive substring search on each page's markdown.

    Args:
        markdown (OCRResponse): The OCR response of the pdf.
        search_query (str): The search query.

    Returns:
        list[int]: The zero-based page numbers of the pdf that contain the
            search query, in ascending order.
    """
    # Lowercase the query once instead of once per page.
    needle = search_query.lower()
    return [
        page_number
        for page_number, page in enumerate(markdown.pages)
        if needle in page.markdown.lower()
    ]
77
 
78
def table_of_contents_per_page_pdf(markdown: OCRResponse) -> str:
    """
    Get the table of contents of the pdf.

    Finds all the titles of the pdf (lines starting with ``#`` once stripped)
    to reconstruct the table of contents.

    Args:
        markdown (OCRResponse): The OCR response of the pdf.

    Returns:
        str: One ``<title> - Page <n>`` line per heading, in document order,
            where ``<n>`` is the zero-based page index.
    """
    # Collect entries in a list rather than a dict keyed by title: a heading
    # that recurs on several pages must yield one entry per occurrence
    # instead of being collapsed onto the last page it appears on.
    entries: list[str] = []
    for page_number, page in enumerate(markdown.pages):
        for raw_line in page.markdown.split("\n"):
            title = raw_line.strip()
            if title.startswith("#"):
                entries.append(f"{title} - Page {page_number}")
    return "\n".join(entries)
94
+
95
def get_images_from_pdf(pdf_path: Path, image_ids: list[str]) -> list[str]:
    """
    Placeholder for extracting images from a pdf; not implemented yet.

    Args:
        pdf_path (Path): Currently unused.
        image_ids (list[str]): Currently unused.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("Not implemented")
97
+
98
+ def get_image_description_using_llm(
99
+ base_64_str: str, model: str = "mistral/mistral-small-latest"
100
+ ) -> str | None:
101
+ assert base_64_str.startswith("data:image/jpeg;base64")
102
+
103
+ messages = [
104
+ {
105
+ "role": "user",
106
+ "content": [
107
+ {"type": "text", "text": "Describe this image in detail:"},
108
+ {"type": "image_url", "image_url": {"url": base_64_str}},
109
+ ],
110
+ }
111
+ ]
112
+ try:
113
+ response = completion(
114
+ model=model, # LiteLLM naming convention
115
+ messages=messages,
116
+ temperature=0.0,
117
+ stream=False,
118
+ )
119
+ output = dict(response)["choices"][0].message.content
120
+ except BadRequestError:
121
+ output = ""
122
+ return output
123
 
tests/webcrawler/test_async_search.py CHANGED
@@ -5,8 +5,10 @@ from deepengineer.webcrawler.async_search import (
5
  SearchResponse,
6
  get_tavily_usage,
7
  linkup_search_async,
8
- get_linkup_balance
 
9
  )
 
10
 
11
 
12
  @pytest.mark.expensive
@@ -34,7 +36,8 @@ async def test_tavily_search_async():
34
  assert response.search_results[0].title is not None
35
  assert response.search_results[0].url is not None
36
  assert response.search_results[0].content is not None
37
- assert any(result.raw_content is not None for result in response.search_results)
 
38
 
39
  usage_after = get_tavily_usage()
40
  print(usage_after)
@@ -65,4 +68,32 @@ async def test_linkup_search_async():
65
 
66
  balance_after = get_linkup_balance()
67
  print(balance_after)
68
- assert balance_after == balance_before - 0.005
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  SearchResponse,
6
  get_tavily_usage,
7
  linkup_search_async,
8
+ get_linkup_balance,
9
+ arxiv_search_async
10
  )
11
+ import numpy as np
12
 
13
 
14
  @pytest.mark.expensive
 
36
  assert response.search_results[0].title is not None
37
  assert response.search_results[0].url is not None
38
  assert response.search_results[0].content is not None
39
+ # raw content is often not available for tavily
40
+ # assert any(result.raw_content is not None for result in response.search_results)
41
 
42
  usage_after = get_tavily_usage()
43
  print(usage_after)
 
68
 
69
  balance_after = get_linkup_balance()
70
  print(balance_after)
71
+ assert np.isclose(balance_after, balance_before - 0.005)
72
+
73
@pytest.mark.expensive
@pytest.mark.asyncio
async def test_arxiv_search_async():
    """Run an arXiv search and check the response shape and the Linkup balance."""
    starting_balance = get_linkup_balance()

    result = await arxiv_search_async(
        search_query="Would it be possible to make a thermal reactor with graphite and lead?",
    )

    assert result is not None
    assert isinstance(result, SearchResponse)
    assert result.query is not None
    assert result.answer is not None

    hits = result.search_results
    assert hits is not None
    assert len(hits) >= 10
    assert any(hit.url.startswith("https://arxiv.org/abs/") for hit in hits)

    # The search is expected to lower the Linkup balance by 0.005.
    ending_balance = get_linkup_balance()
    assert np.isclose(ending_balance, starting_balance - 0.005)
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
tests/webcrawler/test_pdfs_tools.py CHANGED
@@ -1,16 +1,40 @@
1
- from deepengineer.webcrawler.pdf_tools import convert_pdf_to_markdown_async
2
  from mistralai import OCRResponse
3
  from deepengineer.common_path import DATA_DIR
4
  import pytest
5
 
 
 
 
 
6
 
7
  @pytest.mark.expensive
8
  @pytest.mark.asyncio
9
  async def test_convert_pdf_to_markdown_async():
10
  pdf_path = DATA_DIR / "report_thermal_neutron.pdf"
11
  assert pdf_path.exists()
12
- markdown, ocr_response = await convert_pdf_to_markdown_async(pdf_path)
 
13
  assert isinstance(ocr_response, OCRResponse)
14
  assert len(ocr_response.pages) == 16
15
- assert isinstance(markdown, str)
16
  assert "where each cylinder represent" in markdown
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from deepengineer.webcrawler.pdf_tools import convert_pdf_to_markdown_async, convert_ocr_response_to_markdown, find_in_pdf, table_of_contents_per_page_pdf, get_markdown_by_page_numbers
2
  from mistralai import OCRResponse
3
  from deepengineer.common_path import DATA_DIR
4
  import pytest
5
 
6
def load_mock_ocr_response() -> OCRResponse:
    """Load the OCR response stored in ``report_thermal_neutron.json`` under ``DATA_DIR``."""
    fixture_path = DATA_DIR / "report_thermal_neutron.json"
    # Decode as UTF-8 explicitly: the platform default encoding varies and
    # can mis-decode non-ASCII characters in the stored OCR text.
    return OCRResponse.model_validate_json(fixture_path.read_text(encoding="utf-8"))
9
+
10
 
11
  @pytest.mark.expensive
12
  @pytest.mark.asyncio
13
  async def test_convert_pdf_to_markdown_async():
14
  pdf_path = DATA_DIR / "report_thermal_neutron.pdf"
15
  assert pdf_path.exists()
16
+ ocr_response = await convert_pdf_to_markdown_async(pdf_path)
17
+ markdown = convert_ocr_response_to_markdown(ocr_response)
18
  assert isinstance(ocr_response, OCRResponse)
19
  assert len(ocr_response.pages) == 16
 
20
  assert "where each cylinder represent" in markdown
21
+
22
+
23
def test_table_of_contents_per_page_pdf():
    """The table of contents built from the fixture lists References on page 15."""
    toc = table_of_contents_per_page_pdf(load_mock_ocr_response())
    assert "References - Page 15" in toc
27
+
28
def test_find_in_pdf():
    """The sample sentence is found on page index 7 of the fixture only."""
    matches = find_in_pdf(load_mock_ocr_response(), "where each cylinder represent")
    assert matches == [7]
32
+
33
def test_get_markdown_by_page_numbers():
    """Extracting pages 7 and 15 yields both page labels and their known content."""
    extracted = get_markdown_by_page_numbers(load_mock_ocr_response(), [7, 15])
    for expected in (
        "Page 7",
        "Page 15",
        "References",
        "where each cylinder represent",
    ):
        assert expected in extracted