Spaces:
Sleeping
Sleeping
Charles Azam
committed on
Commit
·
0868311
1
Parent(s):
abd6d4e
feat: rename pdf to markdown in agent
Browse files
src/deepengineer/deepsearch/analyse_markdown_agent.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
from smolagents import CodeAgent, tool, Tool, LiteLLMModel
|
2 |
-
from deepengineer.webcrawler.pdf_utils import get_markdown_by_page_numbers,
|
3 |
from mistralai import OCRResponse
|
4 |
from enum import Enum
|
5 |
from pathlib import Path
|
@@ -8,7 +8,7 @@ class ToolNames(Enum):
|
|
8 |
GET_TABLE_OF_CONTENTS = "get_table_of_contents"
|
9 |
GET_MARKDOWN = "get_markdown"
|
10 |
GET_PAGES_CONTENT = "get_pages_content"
|
11 |
-
|
12 |
|
13 |
class GetTableOfContentsTool(Tool):
|
14 |
name = ToolNames.GET_TABLE_OF_CONTENTS.value
|
@@ -19,7 +19,7 @@ class GetTableOfContentsTool(Tool):
|
|
19 |
def __init__(self, markdown: OCRResponse):
|
20 |
super().__init__()
|
21 |
self.markdown: OCRResponse = markdown
|
22 |
-
self.table_of_contents: str =
|
23 |
|
24 |
def forward(self) -> str:
|
25 |
return self.table_of_contents
|
@@ -57,8 +57,8 @@ class GetPagesContentTool(Tool):
|
|
57 |
def forward(self, page_numbers: list[int]) -> str:
|
58 |
return get_markdown_by_page_numbers(self.markdown, page_numbers)
|
59 |
|
60 |
-
class
|
61 |
-
name = ToolNames.
|
62 |
description = f"Finds the page numbers of the document that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the document that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages."
|
63 |
inputs = {
|
64 |
"search_queries": {
|
@@ -73,7 +73,7 @@ class FindInPdfTool(Tool):
|
|
73 |
self.markdown: OCRResponse = markdown
|
74 |
|
75 |
def forward(self, search_queries: list[str]) -> list[int]:
|
76 |
-
return
|
77 |
|
78 |
|
79 |
|
@@ -81,21 +81,21 @@ def create_agent(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
|
|
81 |
|
82 |
model = LiteLLMModel(model_id=model_id)
|
83 |
|
84 |
-
|
85 |
GetTableOfContentsTool(markdown),
|
86 |
GetMarkdownTool(markdown),
|
87 |
GetPagesContentTool(markdown),
|
88 |
-
|
89 |
]
|
90 |
-
|
91 |
model=model,
|
92 |
-
tools=
|
93 |
max_steps=20,
|
94 |
verbosity_level=2,
|
95 |
planning_interval=4,
|
96 |
-
name="
|
97 |
-
description="""A team member that
|
98 |
)
|
99 |
-
|
100 |
|
101 |
-
return
|
|
|
1 |
from smolagents import CodeAgent, tool, Tool, LiteLLMModel
|
2 |
+
from deepengineer.webcrawler.pdf_utils import get_markdown_by_page_numbers, get_table_of_contents_per_page_markdown, find_in_markdown, convert_ocr_response_to_markdown
|
3 |
from mistralai import OCRResponse
|
4 |
from enum import Enum
|
5 |
from pathlib import Path
|
|
|
8 |
GET_TABLE_OF_CONTENTS = "get_table_of_contents"
|
9 |
GET_MARKDOWN = "get_markdown"
|
10 |
GET_PAGES_CONTENT = "get_pages_content"
|
11 |
+
FIND_IN_MARKDOWN = "find_in_markdown"
|
12 |
|
13 |
class GetTableOfContentsTool(Tool):
|
14 |
name = ToolNames.GET_TABLE_OF_CONTENTS.value
|
|
|
19 |
def __init__(self, markdown: OCRResponse):
|
20 |
super().__init__()
|
21 |
self.markdown: OCRResponse = markdown
|
22 |
+
self.table_of_contents: str = get_table_of_contents_per_page_markdown(self.markdown)
|
23 |
|
24 |
def forward(self) -> str:
|
25 |
return self.table_of_contents
|
|
|
57 |
def forward(self, page_numbers: list[int]) -> str:
|
58 |
return get_markdown_by_page_numbers(self.markdown, page_numbers)
|
59 |
|
60 |
+
class FindInMarkdownTool(Tool):
|
61 |
+
name = ToolNames.FIND_IN_MARKDOWN.value
|
62 |
description = f"Finds the page numbers of the document that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the document that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages."
|
63 |
inputs = {
|
64 |
"search_queries": {
|
|
|
73 |
self.markdown: OCRResponse = markdown
|
74 |
|
75 |
def forward(self, search_queries: list[str]) -> list[int]:
|
76 |
+
return find_in_markdown(self.markdown, search_queries)
|
77 |
|
78 |
|
79 |
|
|
|
81 |
|
82 |
model = LiteLLMModel(model_id=model_id)
|
83 |
|
84 |
+
MARKDOWN_TOOLS = [
|
85 |
GetTableOfContentsTool(markdown),
|
86 |
GetMarkdownTool(markdown),
|
87 |
GetPagesContentTool(markdown),
|
88 |
+
FindInMarkdownTool(markdown),
|
89 |
]
|
90 |
+
markdown_agent = CodeAgent(
|
91 |
model=model,
|
92 |
+
tools=MARKDOWN_TOOLS,
|
93 |
max_steps=20,
|
94 |
verbosity_level=2,
|
95 |
planning_interval=4,
|
96 |
+
name="markdown_agent",
|
97 |
+
description="""A team member that can analyse a markdown.""",
|
98 |
)
|
99 |
+
markdown_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files."""
|
100 |
|
101 |
+
return markdown_agent
|
src/deepengineer/webcrawler/pdf_utils.py
CHANGED
@@ -62,7 +62,7 @@ def get_markdown_by_page_numbers(markdown: OCRResponse, page_numbers: list[int],
|
|
62 |
markdowns.append(f"*Page {page_number}*\n{markdown.pages[page_number].markdown}")
|
63 |
return "\n\n".join(markdowns)
|
64 |
|
65 |
-
def
|
66 |
"""
|
67 |
Find the page numbers of the pdf that contain the search query.
|
68 |
|
@@ -80,7 +80,7 @@ def find_in_pdf(markdown: OCRResponse, search_queries: list[str]) -> list[int]:
|
|
80 |
page_numbers.append(page_number)
|
81 |
return page_numbers
|
82 |
|
83 |
-
def
|
84 |
"""
|
85 |
Get the table of contents of the pdf.
|
86 |
|
|
|
62 |
markdowns.append(f"*Page {page_number}*\n{markdown.pages[page_number].markdown}")
|
63 |
return "\n\n".join(markdowns)
|
64 |
|
65 |
+
def find_in_markdown(markdown: OCRResponse, search_queries: list[str]) -> list[int]:
|
66 |
"""
|
67 |
Find the page numbers of the pdf that contain the search query.
|
68 |
|
|
|
80 |
page_numbers.append(page_number)
|
81 |
return page_numbers
|
82 |
|
83 |
+
def get_table_of_contents_per_page_markdown(markdown: OCRResponse) -> str:
|
84 |
"""
|
85 |
Get the table of contents of the pdf.
|
86 |
|
src/deepengineer/webcrawler/tools.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
from deepengineer.webcrawler.async_search import linkup_search_async, tavily_search_async, arxiv_search_async, pubmed_search_async, sciencedirect_search_async, scientific_search_async
|
2 |
from deepengineer.webcrawler.async_crawl import crawl4ai_extract_markdown_of_url_async, arxiv_download_pdf_async, download_pdf_async
|
3 |
-
from deepengineer.webcrawler.pdf_utils import
|
4 |
from typing import Callable
|
5 |
from smolagents.tools import get_json_schema
|
6 |
|
@@ -14,4 +14,4 @@ def print_function_signature_smolagents(tool_function: Callable):
|
|
14 |
print("inputs: ", tool_json_schema["parameters"]["properties"])
|
15 |
print("output_type: ", tool_json_schema["return"]["type"])
|
16 |
|
17 |
-
print_function_signature_smolagents(
|
|
|
1 |
from deepengineer.webcrawler.async_search import linkup_search_async, tavily_search_async, arxiv_search_async, pubmed_search_async, sciencedirect_search_async, scientific_search_async
|
2 |
from deepengineer.webcrawler.async_crawl import crawl4ai_extract_markdown_of_url_async, arxiv_download_pdf_async, download_pdf_async
|
3 |
+
from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown
|
4 |
from typing import Callable
|
5 |
from smolagents.tools import get_json_schema
|
6 |
|
|
|
14 |
print("inputs: ", tool_json_schema["parameters"]["properties"])
|
15 |
print("output_type: ", tool_json_schema["return"]["type"])
|
16 |
|
17 |
+
print_function_signature_smolagents(get_table_of_contents_per_page_markdown)
|
tests/webcrawler/test_pdfs_utils.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async, convert_ocr_response_to_markdown,
|
2 |
from mistralai import OCRResponse
|
3 |
from deepengineer.common_path import DATA_DIR
|
4 |
import pytest
|
@@ -22,12 +22,12 @@ async def test_convert_pdf_to_markdown_async():
|
|
22 |
|
23 |
def test_table_of_contents_per_page_pdf():
|
24 |
ocr_response = load_mock_ocr_response()
|
25 |
-
table_of_contents =
|
26 |
assert "References - Page 15" in table_of_contents
|
27 |
|
28 |
def test_find_in_pdf():
|
29 |
ocr_response = load_mock_ocr_response()
|
30 |
-
page_numbers =
|
31 |
assert page_numbers == [7]
|
32 |
|
33 |
def test_get_markdown_by_page_numbers():
|
|
|
1 |
+
from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async, convert_ocr_response_to_markdown, find_in_markdown, get_table_of_contents_per_page_markdown, get_markdown_by_page_numbers
|
2 |
from mistralai import OCRResponse
|
3 |
from deepengineer.common_path import DATA_DIR
|
4 |
import pytest
|
|
|
22 |
|
23 |
def test_table_of_contents_per_page_pdf():
|
24 |
ocr_response = load_mock_ocr_response()
|
25 |
+
table_of_contents = get_table_of_contents_per_page_markdown(ocr_response)
|
26 |
assert "References - Page 15" in table_of_contents
|
27 |
|
28 |
def test_find_in_pdf():
|
29 |
ocr_response = load_mock_ocr_response()
|
30 |
+
page_numbers = find_in_markdown(ocr_response, "where each cylinder represent")
|
31 |
assert page_numbers == [7]
|
32 |
|
33 |
def test_get_markdown_by_page_numbers():
|