Charles Azam committed
Commit · f1368c4
Parent(s): b1060b0

feat: add crawling functions
- .vscode/settings.json +7 -0
- data/{linkup_response.json → answers/linkup_response.json} +0 -0
- data/{tavily_response.json → answers/tavily_response.json} +0 -0
- pyproject.toml +4 -0
- src/deepengineer/common_path.py +11 -0
- src/deepengineer/webcrawler/async_crawl.py +31 -13
- src/deepengineer/webcrawler/async_search.py +30 -22
- src/deepengineer/webcrawler/pdf_tools.py +130 -0
- src/deepengineer/webcrawler/testing.py +20 -0
- tests/webcrawler/test_async_crawl.py +44 -0
- tests/webcrawler/{test_utils.py → test_async_search.py} +9 -10
- uv.lock +54 -0
.vscode/settings.json
ADDED
@@ -0,0 +1,7 @@
+{
+    "python.testing.pytestArgs": [
+        "tests"
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true
+}
data/{linkup_response.json → answers/linkup_response.json}
RENAMED
File without changes

data/{tavily_response.json → answers/tavily_response.json}
RENAMED
File without changes
pyproject.toml
CHANGED
@@ -21,6 +21,10 @@ dependencies = [
     "gradio",
     "open-deep-research",
     "python-dotenv>=1.1.1",
+    "httpx",
+    "pypdf",
+    "pytest-asyncio>=1.0.0",
+    "mistralai>=1.9.1",
 ]
 
 [project.scripts]
src/deepengineer/common_path.py
ADDED
@@ -0,0 +1,11 @@
+from pathlib import Path
+
+DEEPENGINEER_CODE_DIR = Path(__file__).parent
+DEEPENGINEER_SRC_DIR = DEEPENGINEER_CODE_DIR.parent
+DEEPENGINEER_ROOT_DIR = DEEPENGINEER_SRC_DIR.parent
+
+assert DEEPENGINEER_CODE_DIR.name == "deepengineer"
+assert DEEPENGINEER_SRC_DIR.name == "src"
+
+DATA_DIR = DEEPENGINEER_ROOT_DIR / "data"
+assert DATA_DIR.exists()
src/deepengineer/webcrawler/async_crawl.py
CHANGED
@@ -1,15 +1,33 @@
-async def crawl4ai_extract_async():
-    pass
-
-async def crawl4ai_crawl_async():
-    pass
-
+import aiofiles
+import httpx
+import crawl4ai
+import os
+from pathlib import Path
 
+async def crawl4ai_extract_markdown_of_url_async(url: str) -> str:
+    """Extract markdown content from a URL using crawl4ai."""
+    async with crawl4ai.AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url=url)
+        return result.markdown
+
+async def download_pdf_async(url: str, output_path: Path) -> str:
+    """Download a PDF file from a URL."""
+    timeout = httpx.Timeout(30.0, connect=10.0)
+    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
+        response = await client.get(url)
+        response.raise_for_status()
+        async with aiofiles.open(output_path, "wb") as f:
+            await f.write(response.content)
+        return output_path
 
+async def arxiv_download_pdf_async(url: str, output_path: Path) -> str:
+    """Download a PDF from arXiv by converting the abstract URL to PDF URL."""
+    # Extract the arXiv ID from the URL
+    if "/abs/" in url:
+        arxiv_id = url.split("/abs/")[1].rstrip("/")
+        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+    else:
+        # If it's already a PDF URL, use it as is
+        pdf_url = url
+
+    return await download_pdf_async(pdf_url, output_path)
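Usage note: a minimal sketch of how the new crawl helpers could be driven from an asyncio entry point. The two URLs are the fixtures added in testing.py in this commit; the output file name is illustrative.

import asyncio
from pathlib import Path

from deepengineer.webcrawler.async_crawl import (
    crawl4ai_extract_markdown_of_url_async,
    arxiv_download_pdf_async,
)


async def main() -> None:
    # Render a page to markdown with crawl4ai.
    markdown = await crawl4ai_extract_markdown_of_url_async(
        "https://en.wikipedia.org/wiki/Graphite-moderated_reactor"
    )
    print(markdown[:200])

    # Rewrite an arXiv abstract URL to its PDF counterpart and download it.
    pdf_path = await arxiv_download_pdf_async(
        "https://arxiv.org/abs/1301.1699", output_path=Path("temp.pdf")
    )
    print(pdf_path)


asyncio.run(main())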
src/deepengineer/webcrawler/async_search.py
CHANGED
@@ -3,6 +3,7 @@ import asyncio
 import requests
 from pydantic import BaseModel, Field
 from typing import List, Optional, Literal
+from enum import Enum
 
 from linkup import LinkupClient, LinkupSourcedAnswer
 from tavily import AsyncTavilyClient
@@ -23,6 +24,11 @@ class SearchResponse(BaseModel):
     answer: str | None = Field(None, description="Direct answer from the search API if available")
     search_results: list[SearchResult] = Field(default_factory=list, description="List of search results")
 
+class ScientificDomains(str, Enum):
+    wikipedia = "wikipedia.org"
+    arxiv = "arxiv.org"
+    pubmed = "pubmed.ncbi.nlm.nih.gov"
+    sciencedirect = "sciencedirect.com"
 
 def get_tavily_usage():
     url = "https://api.tavily.com/usage"
@@ -39,7 +45,8 @@ async def tavily_search_async(
     include_answer: Literal["basic", "advanced"] | None = "advanced",
     include_raw_content: Literal["text", "markdown"] | None = "markdown",
     include_images: bool = False,
-    search_depth: Literal['basic', 'advanced'] | None = "basic"
+    search_depth: Literal['basic', 'advanced'] | None = "basic",
+    include_domains: list[ScientificDomains] = None,
 ) -> SearchResponse:
     """
     Performs concurrent web searches with the Tavily API
@@ -52,7 +59,8 @@ async def tavily_search_async(
         include_answer=include_answer,
         include_raw_content=include_raw_content,
         max_results=max_results,
-        include_images=include_images
+        include_images=include_images,
+        include_domains=include_domains,
     )
 
     search_results = [
@@ -90,6 +98,7 @@ async def async_linkup_search(
     depth: Literal["standard", "deep"] = "standard",
     output_type: Literal['searchResults', 'sourcedAnswer', 'structured'] = "sourcedAnswer",
     include_images: bool = False,
+    include_domains: list[ScientificDomains] = None,
 ) -> SearchResponse:
     """
     Performs concurrent web searches using the Linkup API.
@@ -100,11 +109,10 @@ async def async_linkup_search(
         query=search_query,
         depth=depth,
         output_type=output_type,
-        include_images=include_images
+        include_images=include_images,
+        include_domains=include_domains,
     )
 
-
-
     search_results = [
         SearchResult(
             title=result.name,
@@ -126,28 +134,28 @@ async def async_linkup_search(
 
 
 
-class ArxivSearchParams(BaseModel):
-    """Parameters for arXiv search."""
-    load_max_docs: int = Field(default=5, ge=1, le=20, description="Maximum number of documents to return per query")
-    get_full_documents: bool = Field(default=True, description="Whether to fetch full text of documents")
-    load_all_available_meta: bool = Field(default=True, description="Whether to load all available metadata")
-
-
-class PubMedSearchParams(BaseModel):
-    """Parameters for PubMed search."""
-    top_k_results: int = Field(default=5, ge=1, le=20, description="Maximum number of documents to return per query")
-    email: Optional[str] = Field(None, description="Email address for PubMed API. Required by NCBI.")
-    api_key: Optional[str] = Field(None, description="API key for PubMed API for higher rate limits")
-    doc_content_chars_max: int = Field(default=4000, ge=100, le=10000, description="Maximum characters for document content")
-
 
 async def arxiv_search_async(
     search_query: str,
 ) -> SearchResponse:
+    response = await async_linkup_search(search_query, include_domains=[ScientificDomains.arxiv])
+    return response
 
 
 async def pubmed_search_async(
+    search_query: str,
+) -> SearchResponse:
+    response = await async_linkup_search(search_query, include_domains=[ScientificDomains.pubmed])
+    return response
+
+
+async def sciencedirect_search_async(
+    search_query: str,
+) -> SearchResponse:
+    response = await async_linkup_search(search_query, include_domains=[ScientificDomains.sciencedirect])
+    return response
+
+
+async def scientific_search_async(
+    search_query: str,
 ) -> SearchResponse:
+    response = await async_linkup_search(search_query, include_domains=[ScientificDomains.wikipedia, ScientificDomains.arxiv, ScientificDomains.pubmed, ScientificDomains.sciencedirect])
+    return response
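Usage note: a minimal sketch of the new domain-restricted wrappers, assuming the Linkup API key is configured in the environment as the existing async_linkup_search already requires. The query mirrors the one used in the tests.

import asyncio

from deepengineer.webcrawler.async_search import (
    arxiv_search_async,
    scientific_search_async,
)


async def main() -> None:
    query = "Would it be possible to make a thermal reactor with graphite and lead?"

    # Linkup search restricted to arxiv.org only.
    arxiv_response = await arxiv_search_async(query)
    print(arxiv_response.answer)

    # Linkup search across wikipedia.org, arxiv.org, pubmed.ncbi.nlm.nih.gov and sciencedirect.com.
    combined = await scientific_search_async(query)
    print(combined.answer)


asyncio.run(main())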
src/deepengineer/webcrawler/pdf_tools.py
ADDED
@@ -0,0 +1,130 @@
+import os
+from pathlib import Path
+from pypdf import PdfReader, PdfWriter
+import io
+from pathlib import Path
+from mistralai import Mistral
+import os
+from litellm import completion
+
+from mistralai.models import OCRResponse
+import yaml
+from tenacity import retry, stop_after_attempt, wait_fixed, RetryError
+from litellm.exceptions import BadRequestError
+
+# Define the size limit in bytes
+MAX_SIZE_BYTES = 49 * 1024 * 1024
+
+
+async def convert_pdf_to_markdown_async(
+    pdf_path: Path,
+    with_image_description: bool = False,
+) -> tuple[str, OCRResponse]:
+
+    mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
+
+    uploaded_pdf = await mistral_client.files.upload_async(
+        file={
+            "file_name": "uploaded_file.pdf",
+            "content": open(pdf_path, "rb"),
+        },
+        purpose="ocr",
+    )
+
+    signed_url = await mistral_client.files.get_signed_url_async(file_id=uploaded_pdf.id)
+
+    ocr_response = await mistral_client.ocr.process_async(
+        model="mistral-ocr-latest",
+        document={"type": "document_url", "document_url": signed_url.url},
+        include_image_base64=True,
+    )
+    print(f"Processing PDF: {pdf_path.name}")
+    return (
+        _get_combined_markdown(
+            ocr_response=ocr_response, with_image_description=with_image_description
+        ),
+        ocr_response,
+    )
+
+
+def _get_image_description_using_llm(
+    base_64_str: str, model: str = "mistral/mistral-small-latest"
+) -> str | None:
+    assert base_64_str.startswith("data:image/jpeg;base64")
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Describe this image in detail:"},
+                {"type": "image_url", "image_url": {"url": base_64_str}},
+            ],
+        }
+    ]
+    try:
+        response = completion(
+            model=model,  # LiteLLM naming convention
+            messages=messages,
+            temperature=0.0,
+            stream=False,
+        )
+        output = dict(response)["choices"][0].message.content
+    except BadRequestError:
+        output = ""
+    return output
+
+
+def _replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
+    """
+    Replace image placeholders in markdown with base64-encoded images.
+
+    Args:
+        markdown_str: Markdown text containing image placeholders
+        images_dict: Dictionary mapping image IDs to base64 strings
+
+    Returns:
+        Markdown text with images replaced by base64 data
+    """
+    for img_name, base64_str in images_dict.items():
+        print(f"Processing image: {img_name}")
+        try:
+            image_description = _get_image_description_using_llm(base_64_str=base64_str)
+        except RetryError:
+            image_description = "Image not found"
+        formatted_description = f"""> [Image {img_name} Replaced with Description Below]
+> {image_description.replace('\n', '\n> ')}
+"""
+        markdown_str = markdown_str.replace(
+            f"![{img_name}]({img_name})", formatted_description
+        )
+    return markdown_str
+
+
+def _get_combined_markdown(
+    ocr_response: OCRResponse, with_image_description: bool
+) -> str:
+    """
+    Combine OCR text and images into a single markdown document.
+
+    Args:
+        ocr_response: Response from OCR processing containing text and images
+
+    Returns:
+        Combined markdown string with embedded images
+    """
+
+    markdowns: list[str] = []
+    # Extract images from page
+    for page in ocr_response.pages:
+        # Replace image placeholders with actual images
+        if with_image_description:
+            image_data = {}
+            for img in page.images:
+                image_data[img.id] = img.image_base64
+            page_description = _replace_images_in_markdown(page.markdown, image_data)
+        else:
+            page_description = page.markdown
+        markdowns.append(page_description)
+
+    return "\n\n".join(markdowns)
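Usage note: a minimal sketch of the OCR helper, assuming MISTRAL_API_KEY is set. The PDF path reuses the fixture referenced by the expensive test added below; with_image_description=True additionally replaces each embedded image with an LLM-generated description (one extra completion call per image).

import asyncio

from deepengineer.common_path import DATA_DIR
from deepengineer.webcrawler.pdf_tools import convert_pdf_to_markdown_async


async def main() -> None:
    pdf_path = DATA_DIR / "report_thermal_neutron.pdf"
    markdown, ocr_response = await convert_pdf_to_markdown_async(
        pdf_path, with_image_description=False
    )
    print(f"{len(ocr_response.pages)} pages converted")
    print(markdown[:500])


asyncio.run(main())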
src/deepengineer/webcrawler/testing.py
ADDED
@@ -0,0 +1,20 @@
+from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.async_search import SearchResponse
+
+LINKUP_RESPONSE_FILE = DATA_DIR / "answers" / "linkup_response.json"
+TAVILY_RESPONSE_FILE = DATA_DIR / "answers" / "tavily_response.json"
+
+
+def load_linkup_response() -> SearchResponse:
+    with open(LINKUP_RESPONSE_FILE, "r") as f:
+        return SearchResponse.model_validate_json(f.read())
+
+def load_tavily_response() -> SearchResponse:
+    with open(TAVILY_RESPONSE_FILE, "r") as f:
+        return SearchResponse.model_validate_json(f.read())
+
+URL_WIKIPEDIA = "https://en.wikipedia.org/wiki/Graphite-moderated_reactor"
+URL_PDF = "https://arxiv.org/pdf/1301.1699.pdf"
+ARXIV_URL = "https://arxiv.org/abs/1301.1699"
+PUBMED_URL = "https://pubmed.ncbi.nlm.nih.gov/34100000/"
+SCIENCEDIRECT_URL = "https://www.sciencedirect.com/science/article/abs/pii/0168900289901964"
tests/webcrawler/test_async_crawl.py
ADDED
@@ -0,0 +1,44 @@
+import pytest
+from deepengineer.webcrawler.async_crawl import (
+    crawl4ai_extract_markdown_of_url_async,
+    download_pdf_async,
+    arxiv_download_pdf_async,
+)
+from mistralai import OCRResponse
+from deepengineer.webcrawler.pdf_tools import convert_pdf_to_markdown_async
+from deepengineer.webcrawler.testing import URL_WIKIPEDIA, URL_PDF, ARXIV_URL
+from deepengineer.common_path import DATA_DIR
+
+@pytest.mark.asyncio
+async def test_crawl4ai_extract_markdown_of_url_async():
+    markdown = await crawl4ai_extract_markdown_of_url_async(URL_WIKIPEDIA)
+    assert isinstance(markdown, str)
+    assert "Graphite-moderated reactor" in markdown
+
+@pytest.mark.asyncio
+async def test_download_pdf_async():
+    output_path = DATA_DIR / "temp.pdf"
+    output_path.unlink(missing_ok=True)
+    pdf_path = await download_pdf_async(URL_PDF, output_path=output_path)
+    assert pdf_path == output_path
+    assert output_path.exists()
+
+@pytest.mark.asyncio
+async def test_arxiv_download_pdf_async():
+    output_path = DATA_DIR / "temp.pdf"
+    output_path.unlink(missing_ok=True)
+    assert not output_path.exists()
+    pdf_path = await arxiv_download_pdf_async(ARXIV_URL, output_path=output_path)
+    assert pdf_path == output_path
+    assert output_path.exists()
+
+@pytest.mark.expensive
+@pytest.mark.asyncio
+async def test_convert_pdf_to_markdown_async():
+    pdf_path = DATA_DIR / "report_thermal_neutron.pdf"
+    assert pdf_path.exists()
+    markdown, ocr_response = await convert_pdf_to_markdown_async(pdf_path)
+    assert isinstance(ocr_response, OCRResponse)
+    assert len(ocr_response.pages) == 16
+    assert isinstance(markdown, str)
+    assert "where each cylinder represent" in markdown
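Note: these tests combine the pytest-asyncio marker with a custom "expensive" marker. A minimal conftest.py sketch that would register the custom marker (an assumption — the commit does not show where "expensive" is declared):

# conftest.py (hypothetical)
def pytest_configure(config):
    # Register the custom marker so pytest does not warn about unknown markers.
    config.addinivalue_line(
        "markers", "expensive: tests that call paid external APIs"
    )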
tests/webcrawler/{test_utils.py → test_async_search.py}
RENAMED
@@ -10,17 +10,17 @@ from deepengineer.webcrawler.async_search import (
 
 
 @pytest.mark.expensive
-def test_tavily_search_async():
+@pytest.mark.asyncio
+async def test_tavily_search_async():
 
     usage_before = get_tavily_usage()
     print(usage_before)
 
 
-    response =
-        search_query="Would it be possible to make a thermal reactor with graphite and lead?",
-    )
+    response = await tavily_search_async(
+        search_query="Would it be possible to make a thermal reactor with graphite and lead?",
     )
+
     print(response.answer)
     assert response is not None
     assert isinstance(response, SearchResponse)
@@ -41,15 +41,14 @@ def test_tavily_search_async():
     assert usage_after == usage_before + 1
 
 @pytest.mark.expensive
-def test_linkup_search_async():
+@pytest.mark.asyncio
+async def test_linkup_search_async():
 
     balance_before = get_linkup_balance()
     print(balance_before)
 
-    response =
-        search_query="Would it be possible to make a thermal reactor with graphite and lead?",
-    )
+    response = await async_linkup_search(
+        search_query="Would it be possible to make a thermal reactor with graphite and lead?",
     )
     print(response.answer)
     assert response is not None
uv.lock
CHANGED
@@ -514,11 +514,15 @@ dependencies = [
     { name = "datasets" },
     { name = "fasttext-wheel" },
     { name = "gradio" },
+    { name = "httpx" },
     { name = "langchain" },
     { name = "litellm" },
+    { name = "mistralai" },
     { name = "open-deep-research" },
     { name = "openai" },
     { name = "pillow" },
+    { name = "pypdf" },
+    { name = "pytest-asyncio" },
     { name = "python-dotenv" },
     { name = "smolagents" },
     { name = "transformers" },
@@ -531,11 +535,15 @@ requires-dist = [
     { name = "datasets" },
     { name = "fasttext-wheel" },
     { name = "gradio" },
+    { name = "httpx" },
     { name = "langchain" },
     { name = "litellm" },
+    { name = "mistralai", specifier = ">=1.9.1" },
     { name = "open-deep-research", git = "https://github.com/langchain-ai/open_deep_research" },
     { name = "openai" },
     { name = "pillow" },
+    { name = "pypdf" },
+    { name = "pytest-asyncio", specifier = ">=1.0.0" },
     { name = "python-dotenv", specifier = ">=1.1.1" },
     { name = "smolagents", specifier = ">=1.19.0" },
     { name = "transformers" },
@@ -574,6 +582,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6a/28/bd84c58623bc7a817b30ae8ba23356f8cfca8577969c15c7133ac29db1e4/duckduckgo_search-8.0.5-py3-none-any.whl", hash = "sha256:c9f18cb8f8311b9005c8f8f699216dd124b01a797d97a2df725942c1af4c16fa", size = 18228 },
 ]
 
+[[package]]
+name = "eval-type-backport"
+version = "0.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/30/ea/8b0ac4469d4c347c6a385ff09dc3c048c2d021696664e26c7ee6791631b5/eval_type_backport-0.2.2.tar.gz", hash = "sha256:f0576b4cf01ebb5bd358d02314d31846af5e07678387486e2c798af0e7d849c1", size = 9079 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ce/31/55cd413eaccd39125368be33c46de24a1f639f2e12349b0361b4678f3915/eval_type_backport-0.2.2-py3-none-any.whl", hash = "sha256:cb6ad7c393517f476f96d456d0412ea80f0a8cf96f6892834cd9340149111b0a", size = 5830 },
+]
+
 [[package]]
 name = "exa-py"
 version = "1.14.14"
@@ -1530,6 +1547,22 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 },
 ]
 
+[[package]]
+name = "mistralai"
+version = "1.9.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "eval-type-backport" },
+    { name = "httpx" },
+    { name = "pydantic" },
+    { name = "python-dateutil" },
+    { name = "typing-inspection" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/8c/d9/e704bb05f0eb5c6726cbbb0c9c16542553b4cdb6b4ccaf45b8c033105738/mistralai-1.9.1.tar.gz", hash = "sha256:89eb1d48e9555c8289c02ddea966115eba0516355731726ea0a24eabb42f8419", size = 182308 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/10/a2/2e177165a24d978f07cf5d5841265ab399c187b0a44077d67502b8129b27/mistralai-1.9.1-py3-none-any.whl", hash = "sha256:250ec26534db6f4a4d5e6292b0801a64da2ab1f0d4c63a20d8ce27e3a427e402", size = 381773 },
+]
+
 [[package]]
 name = "msal"
 version = "1.32.3"
@@ -2116,6 +2149,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771 },
 ]
 
+[[package]]
+name = "pypdf"
+version = "5.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7b/42/fbc37af367b20fa6c53da81b1780025f6046a0fac8cbf0663a17e743b033/pypdf-5.7.0.tar.gz", hash = "sha256:68c92f2e1aae878bab1150e74447f31ab3848b1c0a6f8becae9f0b1904460b6f", size = 5026120 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/73/9f/78d096ef795a813fa0e1cb9b33fa574b205f2b563d9c1e9366c854cf0364/pypdf-5.7.0-py3-none-any.whl", hash = "sha256:203379453439f5b68b7a1cd43cdf4c5f7a02b84810cefa7f93a47b350aaaba48", size = 305524 },
+]
+
 [[package]]
 name = "pyperclip"
 version = "1.9.0"
@@ -2138,6 +2180,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474 },
 ]
 
+[[package]]
+name = "pytest-asyncio"
+version = "1.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d0/d4/14f53324cb1a6381bef29d698987625d80052bb33932d8e7cbf9b337b17c/pytest_asyncio-1.0.0.tar.gz", hash = "sha256:d15463d13f4456e1ead2594520216b225a16f781e144f8fdf6c5bb4667c48b3f", size = 46960 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/30/05/ce271016e351fddc8399e546f6e23761967ee09c8c568bbfbecb0c150171/pytest_asyncio-1.0.0-py3-none-any.whl", hash = "sha256:4f024da9f1ef945e680dc68610b52550e36590a67fd31bb3b4943979a1f90ef3", size = 15976 },
+]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"