Charles Azam committed on
Commit
0868311
·
1 Parent(s): abd6d4e

feat: rename pdf to markdown in agent

Browse files
src/deepengineer/deepsearch/analyse_markdown_agent.py CHANGED
@@ -1,5 +1,5 @@
1
  from smolagents import CodeAgent, tool, Tool, LiteLLMModel
2
- from deepengineer.webcrawler.pdf_utils import get_markdown_by_page_numbers, get_table_of_contents_per_page_pdf, find_in_pdf, convert_ocr_response_to_markdown
3
  from mistralai import OCRResponse
4
  from enum import Enum
5
  from pathlib import Path
@@ -8,7 +8,7 @@ class ToolNames(Enum):
8
  GET_TABLE_OF_CONTENTS = "get_table_of_contents"
9
  GET_MARKDOWN = "get_markdown"
10
  GET_PAGES_CONTENT = "get_pages_content"
11
- FIND_IN_PDF = "find_in_pdf"
12
 
13
  class GetTableOfContentsTool(Tool):
14
  name = ToolNames.GET_TABLE_OF_CONTENTS.value
@@ -19,7 +19,7 @@ class GetTableOfContentsTool(Tool):
19
  def __init__(self, markdown: OCRResponse):
20
  super().__init__()
21
  self.markdown: OCRResponse = markdown
22
- self.table_of_contents: str = get_table_of_contents_per_page_pdf(self.markdown)
23
 
24
  def forward(self) -> str:
25
  return self.table_of_contents
@@ -57,8 +57,8 @@ class GetPagesContentTool(Tool):
57
  def forward(self, page_numbers: list[int]) -> str:
58
  return get_markdown_by_page_numbers(self.markdown, page_numbers)
59
 
60
- class FindInPdfTool(Tool):
61
- name = ToolNames.FIND_IN_PDF.value
62
  description = f"Finds the page numbers of the document that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the document that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages."
63
  inputs = {
64
  "search_queries": {
@@ -73,7 +73,7 @@ class FindInPdfTool(Tool):
73
  self.markdown: OCRResponse = markdown
74
 
75
  def forward(self, search_queries: list[str]) -> list[int]:
76
- return find_in_pdf(self.markdown, search_queries)
77
 
78
 
79
 
@@ -81,21 +81,21 @@ def create_agent(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
81
 
82
  model = LiteLLMModel(model_id=model_id)
83
 
84
- PDFS_TOOLS = [
85
  GetTableOfContentsTool(markdown),
86
  GetMarkdownTool(markdown),
87
  GetPagesContentTool(markdown),
88
- FindInPdfTool(markdown),
89
  ]
90
- pdf_agent = CodeAgent(
91
  model=model,
92
- tools=PDFS_TOOLS,
93
  max_steps=20,
94
  verbosity_level=2,
95
  planning_interval=4,
96
- name="pdf_agent",
97
- description="""A team member that will search the internet to answer your question.""",
98
  )
99
- pdf_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files."""
100
 
101
- return pdf_agent
 
1
  from smolagents import CodeAgent, tool, Tool, LiteLLMModel
2
+ from deepengineer.webcrawler.pdf_utils import get_markdown_by_page_numbers, get_table_of_contents_per_page_markdown, find_in_markdown, convert_ocr_response_to_markdown
3
  from mistralai import OCRResponse
4
  from enum import Enum
5
  from pathlib import Path
 
8
  GET_TABLE_OF_CONTENTS = "get_table_of_contents"
9
  GET_MARKDOWN = "get_markdown"
10
  GET_PAGES_CONTENT = "get_pages_content"
11
+ FIND_IN_MARKDOWN = "find_in_markdown"
12
 
13
  class GetTableOfContentsTool(Tool):
14
  name = ToolNames.GET_TABLE_OF_CONTENTS.value
 
19
  def __init__(self, markdown: OCRResponse):
20
  super().__init__()
21
  self.markdown: OCRResponse = markdown
22
+ self.table_of_contents: str = get_table_of_contents_per_page_markdown(self.markdown)
23
 
24
  def forward(self) -> str:
25
  return self.table_of_contents
 
57
  def forward(self, page_numbers: list[int]) -> str:
58
  return get_markdown_by_page_numbers(self.markdown, page_numbers)
59
 
60
+ class FindInMarkdownTool(Tool):
61
+ name = ToolNames.FIND_IN_MARKDOWN.value
62
  description = f"Finds the page numbers of the document that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the document that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages."
63
  inputs = {
64
  "search_queries": {
 
73
  self.markdown: OCRResponse = markdown
74
 
75
  def forward(self, search_queries: list[str]) -> list[int]:
76
+ return find_in_markdown(self.markdown, search_queries)
77
 
78
 
79
 
 
81
 
82
  model = LiteLLMModel(model_id=model_id)
83
 
84
+ MARKDOWN_TOOLS = [
85
  GetTableOfContentsTool(markdown),
86
  GetMarkdownTool(markdown),
87
  GetPagesContentTool(markdown),
88
+ FindInMarkdownTool(markdown),
89
  ]
90
+ markdown_agent = CodeAgent(
91
  model=model,
92
+ tools=MARKDOWN_TOOLS,
93
  max_steps=20,
94
  verbosity_level=2,
95
  planning_interval=4,
96
+ name="markdown_agent",
97
+ description="""A team member that can analyse a markdown.""",
98
  )
99
+ markdown_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files."""
100
 
101
+ return markdown_agent
src/deepengineer/webcrawler/pdf_utils.py CHANGED
@@ -62,7 +62,7 @@ def get_markdown_by_page_numbers(markdown: OCRResponse, page_numbers: list[int],
62
  markdowns.append(f"*Page {page_number}*\n{markdown.pages[page_number].markdown}")
63
  return "\n\n".join(markdowns)
64
 
65
- def find_in_pdf(markdown: OCRResponse, search_queries: list[str]) -> list[int]:
66
  """
67
  Find the page numbers of the pdf that contain the search query.
68
 
@@ -80,7 +80,7 @@ def find_in_pdf(markdown: OCRResponse, search_queries: list[str]) -> list[int]:
80
  page_numbers.append(page_number)
81
  return page_numbers
82
 
83
- def get_table_of_contents_per_page_pdf(markdown: OCRResponse) -> str:
84
  """
85
  Get the table of contents of the pdf.
86
 
 
62
  markdowns.append(f"*Page {page_number}*\n{markdown.pages[page_number].markdown}")
63
  return "\n\n".join(markdowns)
64
 
65
+ def find_in_markdown(markdown: OCRResponse, search_queries: list[str]) -> list[int]:
66
  """
67
  Find the page numbers of the pdf that contain the search query.
68
 
 
80
  page_numbers.append(page_number)
81
  return page_numbers
82
 
83
+ def get_table_of_contents_per_page_markdown(markdown: OCRResponse) -> str:
84
  """
85
  Get the table of contents of the pdf.
86
 
src/deepengineer/webcrawler/tools.py CHANGED
@@ -1,6 +1,6 @@
1
  from deepengineer.webcrawler.async_search import linkup_search_async, tavily_search_async, arxiv_search_async, pubmed_search_async, sciencedirect_search_async, scientific_search_async
2
  from deepengineer.webcrawler.async_crawl import crawl4ai_extract_markdown_of_url_async, arxiv_download_pdf_async, download_pdf_async
3
- from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_pdf
4
  from typing import Callable
5
  from smolagents.tools import get_json_schema
6
 
@@ -14,4 +14,4 @@ def print_function_signature_smolagents(tool_function: Callable):
14
  print("inputs: ", tool_json_schema["parameters"]["properties"])
15
  print("output_type: ", tool_json_schema["return"]["type"])
16
 
17
- print_function_signature_smolagents(get_table_of_contents_per_page_pdf)
 
1
  from deepengineer.webcrawler.async_search import linkup_search_async, tavily_search_async, arxiv_search_async, pubmed_search_async, sciencedirect_search_async, scientific_search_async
2
  from deepengineer.webcrawler.async_crawl import crawl4ai_extract_markdown_of_url_async, arxiv_download_pdf_async, download_pdf_async
3
+ from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown
4
  from typing import Callable
5
  from smolagents.tools import get_json_schema
6
 
 
14
  print("inputs: ", tool_json_schema["parameters"]["properties"])
15
  print("output_type: ", tool_json_schema["return"]["type"])
16
 
17
+ print_function_signature_smolagents(get_table_of_contents_per_page_markdown)
tests/webcrawler/test_pdfs_utils.py CHANGED
@@ -1,4 +1,4 @@
1
- from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async, convert_ocr_response_to_markdown, find_in_pdf, get_table_of_contents_per_page_pdf, get_markdown_by_page_numbers
2
  from mistralai import OCRResponse
3
  from deepengineer.common_path import DATA_DIR
4
  import pytest
@@ -22,12 +22,12 @@ async def test_convert_pdf_to_markdown_async():
22
 
23
  def test_table_of_contents_per_page_pdf():
24
  ocr_response = load_mock_ocr_response()
25
- table_of_contents = get_table_of_contents_per_page_pdf(ocr_response)
26
  assert "References - Page 15" in table_of_contents
27
 
28
  def test_find_in_pdf():
29
  ocr_response = load_mock_ocr_response()
30
- page_numbers = find_in_pdf(ocr_response, "where each cylinder represent")
31
  assert page_numbers == [7]
32
 
33
  def test_get_markdown_by_page_numbers():
 
1
+ from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async, convert_ocr_response_to_markdown, find_in_markdown, get_table_of_contents_per_page_markdown, get_markdown_by_page_numbers
2
  from mistralai import OCRResponse
3
  from deepengineer.common_path import DATA_DIR
4
  import pytest
 
22
 
23
  def test_table_of_contents_per_page_pdf():
24
  ocr_response = load_mock_ocr_response()
25
+ table_of_contents = get_table_of_contents_per_page_markdown(ocr_response)
26
  assert "References - Page 15" in table_of_contents
27
 
28
  def test_find_in_pdf():
29
  ocr_response = load_mock_ocr_response()
30
+ page_numbers = find_in_markdown(ocr_response, "where each cylinder represent")
31
  assert page_numbers == [7]
32
 
33
  def test_get_markdown_by_page_numbers():