Charles Azam committed
Commit · f1368c4
Parent(s): b1060b0

feat: add crawling functions
- .vscode/settings.json +7 -0
- data/{linkup_response.json → answers/linkup_response.json} +0 -0
- data/{tavily_response.json → answers/tavily_response.json} +0 -0
- pyproject.toml +4 -0
- src/deepengineer/common_path.py +11 -0
- src/deepengineer/webcrawler/async_crawl.py +31 -13
- src/deepengineer/webcrawler/async_search.py +30 -22
- src/deepengineer/webcrawler/pdf_tools.py +130 -0
- src/deepengineer/webcrawler/testing.py +20 -0
- tests/webcrawler/test_async_crawl.py +44 -0
- tests/webcrawler/{test_utils.py → test_async_search.py} +9 -10
- uv.lock +54 -0
.vscode/settings.json
ADDED
@@ -0,0 +1,7 @@
+{
+    "python.testing.pytestArgs": [
+        "tests"
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true
+}
data/{linkup_response.json → answers/linkup_response.json}
RENAMED
File without changes

data/{tavily_response.json → answers/tavily_response.json}
RENAMED
File without changes
pyproject.toml
CHANGED
@@ -21,6 +21,10 @@ dependencies = [
     "gradio",
     "open-deep-research",
     "python-dotenv>=1.1.1",
+    "httpx",
+    "pypdf",
+    "pytest-asyncio>=1.0.0",
+    "mistralai>=1.9.1",
 ]
 
 [project.scripts]
src/deepengineer/common_path.py
ADDED
@@ -0,0 +1,11 @@
+from pathlib import Path
+
+DEEPENGINEER_CODE_DIR = Path(__file__).parent
+DEEPENGINEER_SRC_DIR = DEEPENGINEER_CODE_DIR.parent
+DEEPENGINEER_ROOT_DIR = DEEPENGINEER_SRC_DIR.parent
+
+assert DEEPENGINEER_CODE_DIR.name == "deepengineer"
+assert DEEPENGINEER_SRC_DIR.name == "src"
+
+DATA_DIR = DEEPENGINEER_ROOT_DIR / "data"
+assert DATA_DIR.exists()
src/deepengineer/webcrawler/async_crawl.py
CHANGED
@@ -1,15 +1,33 @@
-async def crawl4ai_extract_async():
-    pass
-
-async def crawl4ai_crawl_async():
-    pass
-
+import aiofiles
+import httpx
+import crawl4ai
+import os
+from pathlib import Path
 
+async def crawl4ai_extract_markdown_of_url_async(url: str) -> str:
+    """Extract markdown content from a URL using crawl4ai."""
+    async with crawl4ai.AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url=url)
+        return result.markdown
+
+async def download_pdf_async(url: str, output_path: Path) -> str:
+    """Download a PDF file from a URL."""
+    timeout = httpx.Timeout(30.0, connect=10.0)
+    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
+        response = await client.get(url)
+        response.raise_for_status()
+        async with aiofiles.open(output_path, "wb") as f:
+            await f.write(response.content)
+        return output_path
 
+async def arxiv_download_pdf_async(url: str, output_path: Path) -> str:
+    """Download a PDF from arXiv by converting the abstract URL to PDF URL."""
+    # Extract the arXiv ID from the URL
+    if "/abs/" in url:
+        arxiv_id = url.split("/abs/")[1].rstrip("/")
+        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+    else:
+        # If it's already a PDF URL, use it as is
+        pdf_url = url
+
+    return await download_pdf_async(pdf_url, output_path)
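Usage note: a minimal sketch of how the new crawl helpers could be driven from an asyncio entry point. The two URLs are the fixtures added in testing.py in this commit; the output file name is illustrative.

import asyncio
from pathlib import Path

from deepengineer.webcrawler.async_crawl import (
    crawl4ai_extract_markdown_of_url_async,
    arxiv_download_pdf_async,
)


async def main() -> None:
    # Render a page to markdown with crawl4ai.
    markdown = await crawl4ai_extract_markdown_of_url_async(
        "https://en.wikipedia.org/wiki/Graphite-moderated_reactor"
    )
    print(markdown[:200])

    # Rewrite an arXiv abstract URL to its PDF counterpart and download it.
    pdf_path = await arxiv_download_pdf_async(
        "https://arxiv.org/abs/1301.1699", output_path=Path("temp.pdf")
    )
    print(pdf_path)


asyncio.run(main())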
src/deepengineer/webcrawler/async_search.py
CHANGED
@@ -3,6 +3,7 @@ import asyncio
 import requests
 from pydantic import BaseModel, Field
 from typing import List, Optional, Literal
+from enum import Enum
 
 from linkup import LinkupClient, LinkupSourcedAnswer
 from tavily import AsyncTavilyClient
@@ -23,6 +24,11 @@ class SearchResponse(BaseModel):
     answer: str | None = Field(None, description="Direct answer from the search API if available")
     search_results: list[SearchResult] = Field(default_factory=list, description="List of search results")
 
+class ScientificDomains(str, Enum):
+    wikipedia = "wikipedia.org"
+    arxiv = "arxiv.org"
+    pubmed = "pubmed.ncbi.nlm.nih.gov"
+    sciencedirect = "sciencedirect.com"
 
 def get_tavily_usage():
     url = "https://api.tavily.com/usage"
@@ -39,7 +45,8 @@ async def tavily_search_async(
     include_answer: Literal["basic", "advanced"] | None = "advanced",
     include_raw_content: Literal["text", "markdown"] | None = "markdown",
     include_images: bool = False,
-    search_depth: Literal['basic', 'advanced'] | None = "basic"
+    search_depth: Literal['basic', 'advanced'] | None = "basic",
+    include_domains: list[ScientificDomains] = None,
 ) -> SearchResponse:
     """
     Performs concurrent web searches with the Tavily API
@@ -52,7 +59,8 @@ async def tavily_search_async(
         include_answer=include_answer,
         include_raw_content=include_raw_content,
         max_results=max_results,
-        include_images=include_images
+        include_images=include_images,
+        include_domains=include_domains,
     )
 
     search_results = [
@@ -90,6 +98,7 @@ async def async_linkup_search(
     depth: Literal["standard", "deep"] = "standard",
     output_type: Literal['searchResults', 'sourcedAnswer', 'structured'] = "sourcedAnswer",
     include_images: bool = False,
+    include_domains: list[ScientificDomains] = None,
 ) -> SearchResponse:
     """
     Performs concurrent web searches using the Linkup API.
@@ -100,11 +109,10 @@ async def async_linkup_search(
         query=search_query,
         depth=depth,
         output_type=output_type,
-        include_images=include_images
+        include_images=include_images,
+        include_domains=include_domains,
     )
 
-
-
     search_results = [
         SearchResult(
             title=result.name,
@@ -126,28 +134,28 @@ async def async_linkup_search(
 
 
 
-class ArxivSearchParams(BaseModel):
-    """Parameters for arXiv search."""
-    load_max_docs: int = Field(default=5, ge=1, le=20, description="Maximum number of documents to return per query")
-    get_full_documents: bool = Field(default=True, description="Whether to fetch full text of documents")
-    load_all_available_meta: bool = Field(default=True, description="Whether to load all available metadata")
-
-
-class PubMedSearchParams(BaseModel):
-    """Parameters for PubMed search."""
-    top_k_results: int = Field(default=5, ge=1, le=20, description="Maximum number of documents to return per query")
-    email: Optional[str] = Field(None, description="Email address for PubMed API. Required by NCBI.")
-    api_key: Optional[str] = Field(None, description="API key for PubMed API for higher rate limits")
-    doc_content_chars_max: int = Field(default=4000, ge=100, le=10000, description="Maximum characters for document content")
-
 
 async def arxiv_search_async(
     search_query: str,
 ) -> SearchResponse:
+    response = await async_linkup_search(search_query, include_domains=[ScientificDomains.arxiv])
+    return response
 
 
 async def pubmed_search_async(
+    search_query: str,
+) -> SearchResponse:
+    response = await async_linkup_search(search_query, include_domains=[ScientificDomains.pubmed])
+    return response
+
+
+async def sciencedirect_search_async(
+    search_query: str,
+) -> SearchResponse:
+    response = await async_linkup_search(search_query, include_domains=[ScientificDomains.sciencedirect])
+    return response
+
+
+async def scientific_search_async(
+    search_query: str,
 ) -> SearchResponse:
+    response = await async_linkup_search(search_query, include_domains=[ScientificDomains.wikipedia, ScientificDomains.arxiv, ScientificDomains.pubmed, ScientificDomains.sciencedirect])
+    return response
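Usage note: a minimal sketch of the new domain-restricted wrappers, assuming the Linkup API key is configured in the environment as the existing async_linkup_search already requires. The query mirrors the one used in the tests.

import asyncio

from deepengineer.webcrawler.async_search import (
    arxiv_search_async,
    scientific_search_async,
)


async def main() -> None:
    query = "Would it be possible to make a thermal reactor with graphite and lead?"

    # Linkup search restricted to arxiv.org only.
    arxiv_response = await arxiv_search_async(query)
    print(arxiv_response.answer)

    # Linkup search across wikipedia.org, arxiv.org, pubmed.ncbi.nlm.nih.gov and sciencedirect.com.
    combined = await scientific_search_async(query)
    print(combined.answer)


asyncio.run(main())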
src/deepengineer/webcrawler/pdf_tools.py
ADDED
@@ -0,0 +1,130 @@
+import os
+from pathlib import Path
+from pypdf import PdfReader, PdfWriter
+import io
+from pathlib import Path
+from mistralai import Mistral
+import os
+from litellm import completion
+
+from mistralai.models import OCRResponse
+import yaml
+from tenacity import retry, stop_after_attempt, wait_fixed, RetryError
+from litellm.exceptions import BadRequestError
+
+# Define the size limit in bytes
+MAX_SIZE_BYTES = 49 * 1024 * 1024
+
+
+async def convert_pdf_to_markdown_async(
+    pdf_path: Path,
+    with_image_description: bool = False,
+) -> tuple[str, OCRResponse]:
+
+    mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
+
+    uploaded_pdf = await mistral_client.files.upload_async(
+        file={
+            "file_name": "uploaded_file.pdf",
+            "content": open(pdf_path, "rb"),
+        },
+        purpose="ocr",
+    )
+
+    signed_url = await mistral_client.files.get_signed_url_async(file_id=uploaded_pdf.id)
+
+    ocr_response = await mistral_client.ocr.process_async(
+        model="mistral-ocr-latest",
+        document={"type": "document_url", "document_url": signed_url.url},
+        include_image_base64=True,
+    )
+    print(f"Processing PDF: {pdf_path.name}")
+    return (
+        _get_combined_markdown(
+            ocr_response=ocr_response, with_image_description=with_image_description
+        ),
+        ocr_response,
+    )
+
+
+def _get_image_description_using_llm(
+    base_64_str: str, model: str = "mistral/mistral-small-latest"
+) -> str | None:
+    assert base_64_str.startswith("data:image/jpeg;base64")
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Describe this image in detail:"},
+                {"type": "image_url", "image_url": {"url": base_64_str}},
+            ],
+        }
+    ]
+    try:
+        response = completion(
+            model=model,  # LiteLLM naming convention
+            messages=messages,
+            temperature=0.0,
+            stream=False,
+        )
+        output = dict(response)["choices"][0].message.content
+    except BadRequestError:
+        output = ""
+    return output
+
+
+def _replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
+    """
+    Replace image placeholders in markdown with base64-encoded images.
+
+    Args:
+        markdown_str: Markdown text containing image placeholders
+        images_dict: Dictionary mapping image IDs to base64 strings
+
+    Returns:
+        Markdown text with images replaced by base64 data
+    """
+    for img_name, base64_str in images_dict.items():
+        print(f"Processing image: {img_name}")
+        try:
+            image_description = _get_image_description_using_llm(base_64_str=base64_str)
+        except RetryError:
+            image_description = "Image not found"
+        formatted_description = f"""> [Image {img_name} Replaced with Description Below]
+> {image_description.replace('\n', '\n> ')}
+"""
+        markdown_str = markdown_str.replace(
+            f"![{img_name}]({img_name})", formatted_description
+        )
+    return markdown_str
+
+
+def _get_combined_markdown(
+    ocr_response: OCRResponse, with_image_description: bool
+) -> str:
+    """
+    Combine OCR text and images into a single markdown document.
+
+    Args:
+        ocr_response: Response from OCR processing containing text and images
+
+    Returns:
+        Combined markdown string with embedded images
+    """
+
+    markdowns: list[str] = []
+    # Extract images from page
+    for page in ocr_response.pages:
+        # Replace image placeholders with actual images
+        if with_image_description:
+            image_data = {}
+            for img in page.images:
+                image_data[img.id] = img.image_base64
+            page_description = _replace_images_in_markdown(page.markdown, image_data)
+        else:
+            page_description = page.markdown
+        markdowns.append(page_description)
+
+    return "\n\n".join(markdowns)
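Usage note: a minimal sketch of the OCR helper, assuming MISTRAL_API_KEY is set. The PDF path reuses the fixture referenced by the expensive test added below; with_image_description=True additionally replaces each embedded image with an LLM-generated description (one extra completion call per image).

import asyncio

from deepengineer.common_path import DATA_DIR
from deepengineer.webcrawler.pdf_tools import convert_pdf_to_markdown_async


async def main() -> None:
    pdf_path = DATA_DIR / "report_thermal_neutron.pdf"
    markdown, ocr_response = await convert_pdf_to_markdown_async(
        pdf_path, with_image_description=False
    )
    print(f"{len(ocr_response.pages)} pages converted")
    print(markdown[:500])


asyncio.run(main())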
src/deepengineer/webcrawler/testing.py
ADDED
@@ -0,0 +1,20 @@
+from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.async_search import SearchResponse
+
+LINKUP_RESPONSE_FILE = DATA_DIR / "answers" / "linkup_response.json"
+TAVILY_RESPONSE_FILE = DATA_DIR / "answers" / "tavily_response.json"
+
+
+def load_linkup_response() -> SearchResponse:
+    with open(LINKUP_RESPONSE_FILE, "r") as f:
+        return SearchResponse.model_validate_json(f.read())
+
+def load_tavily_response() -> SearchResponse:
+    with open(TAVILY_RESPONSE_FILE, "r") as f:
+        return SearchResponse.model_validate_json(f.read())
+
+URL_WIKIPEDIA = "https://en.wikipedia.org/wiki/Graphite-moderated_reactor"
+URL_PDF = "https://arxiv.org/pdf/1301.1699.pdf"
+ARXIV_URL = "https://arxiv.org/abs/1301.1699"
+PUBMED_URL = "https://pubmed.ncbi.nlm.nih.gov/34100000/"
+SCIENCEDIRECT_URL = "https://www.sciencedirect.com/science/article/abs/pii/0168900289901964"
tests/webcrawler/test_async_crawl.py
ADDED
@@ -0,0 +1,44 @@
+import pytest
+from deepengineer.webcrawler.async_crawl import (
+    crawl4ai_extract_markdown_of_url_async,
+    download_pdf_async,
+    arxiv_download_pdf_async,
+)
+from mistralai import OCRResponse
+from deepengineer.webcrawler.pdf_tools import convert_pdf_to_markdown_async
+from deepengineer.webcrawler.testing import URL_WIKIPEDIA, URL_PDF, ARXIV_URL
+from deepengineer.common_path import DATA_DIR
+
+@pytest.mark.asyncio
+async def test_crawl4ai_extract_markdown_of_url_async():
+    markdown = await crawl4ai_extract_markdown_of_url_async(URL_WIKIPEDIA)
+    assert isinstance(markdown, str)
+    assert "Graphite-moderated reactor" in markdown
+
+@pytest.mark.asyncio
+async def test_download_pdf_async():
+    output_path = DATA_DIR / "temp.pdf"
+    output_path.unlink(missing_ok=True)
+    pdf_path = await download_pdf_async(URL_PDF, output_path=output_path)
+    assert pdf_path == output_path
+    assert output_path.exists()
+
+@pytest.mark.asyncio
+async def test_arxiv_download_pdf_async():
+    output_path = DATA_DIR / "temp.pdf"
+    output_path.unlink(missing_ok=True)
+    assert not output_path.exists()
+    pdf_path = await arxiv_download_pdf_async(ARXIV_URL, output_path=output_path)
+    assert pdf_path == output_path
+    assert output_path.exists()
+
+@pytest.mark.expensive
+@pytest.mark.asyncio
+async def test_convert_pdf_to_markdown_async():
+    pdf_path = DATA_DIR / "report_thermal_neutron.pdf"
+    assert pdf_path.exists()
+    markdown, ocr_response = await convert_pdf_to_markdown_async(pdf_path)
+    assert isinstance(ocr_response, OCRResponse)
+    assert len(ocr_response.pages) == 16
+    assert isinstance(markdown, str)
+    assert "where each cylinder represent" in markdown
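Note: these tests combine the pytest-asyncio marker with a custom "expensive" marker. A minimal conftest.py sketch that would register the custom marker (an assumption — the commit does not show where "expensive" is declared):

# conftest.py (hypothetical)
def pytest_configure(config):
    # Register the custom marker so pytest does not warn about unknown markers.
    config.addinivalue_line(
        "markers", "expensive: tests that call paid external APIs"
    )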
tests/webcrawler/{test_utils.py → test_async_search.py}
RENAMED
@@ -10,17 +10,17 @@ from deepengineer.webcrawler.async_search import (
 
 
 @pytest.mark.expensive
-def test_tavily_search_async():
+@pytest.mark.asyncio
+async def test_tavily_search_async():
 
     usage_before = get_tavily_usage()
     print(usage_before)
 
 
-    response =
-        search_query="Would it be possible to make a thermal reactor with graphite and lead?",
-    )
+    response = await tavily_search_async(
+        search_query="Would it be possible to make a thermal reactor with graphite and lead?",
     )
+
     print(response.answer)
     assert response is not None
     assert isinstance(response, SearchResponse)
@@ -41,15 +41,14 @@ def test_tavily_search_async():
     assert usage_after == usage_before + 1
 
 @pytest.mark.expensive
-def test_linkup_search_async():
+@pytest.mark.asyncio
+async def test_linkup_search_async():
 
     balance_before = get_linkup_balance()
     print(balance_before)
 
-    response =
-        search_query="Would it be possible to make a thermal reactor with graphite and lead?",
-    )
+    response = await async_linkup_search(
+        search_query="Would it be possible to make a thermal reactor with graphite and lead?",
     )
     print(response.answer)
     assert response is not None
uv.lock
CHANGED
@@ -514,11 +514,15 @@ dependencies = [
     { name = "datasets" },
     { name = "fasttext-wheel" },
     { name = "gradio" },
+    { name = "httpx" },
     { name = "langchain" },
     { name = "litellm" },
+    { name = "mistralai" },
     { name = "open-deep-research" },
     { name = "openai" },
     { name = "pillow" },
+    { name = "pypdf" },
+    { name = "pytest-asyncio" },
     { name = "python-dotenv" },
     { name = "smolagents" },
     { name = "transformers" },
@@ -531,11 +535,15 @@ requires-dist = [
     { name = "datasets" },
     { name = "fasttext-wheel" },
     { name = "gradio" },
+    { name = "httpx" },
     { name = "langchain" },
     { name = "litellm" },
+    { name = "mistralai", specifier = ">=1.9.1" },
     { name = "open-deep-research", git = "https://github.com/langchain-ai/open_deep_research" },
     { name = "openai" },
     { name = "pillow" },
+    { name = "pypdf" },
+    { name = "pytest-asyncio", specifier = ">=1.0.0" },
     { name = "python-dotenv", specifier = ">=1.1.1" },
     { name = "smolagents", specifier = ">=1.19.0" },
     { name = "transformers" },
@@ -574,6 +582,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6a/28/bd84c58623bc7a817b30ae8ba23356f8cfca8577969c15c7133ac29db1e4/duckduckgo_search-8.0.5-py3-none-any.whl", hash = "sha256:c9f18cb8f8311b9005c8f8f699216dd124b01a797d97a2df725942c1af4c16fa", size = 18228 },
 ]
 
+[[package]]
+name = "eval-type-backport"
+version = "0.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/30/ea/8b0ac4469d4c347c6a385ff09dc3c048c2d021696664e26c7ee6791631b5/eval_type_backport-0.2.2.tar.gz", hash = "sha256:f0576b4cf01ebb5bd358d02314d31846af5e07678387486e2c798af0e7d849c1", size = 9079 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ce/31/55cd413eaccd39125368be33c46de24a1f639f2e12349b0361b4678f3915/eval_type_backport-0.2.2-py3-none-any.whl", hash = "sha256:cb6ad7c393517f476f96d456d0412ea80f0a8cf96f6892834cd9340149111b0a", size = 5830 },
+]
+
 [[package]]
 name = "exa-py"
 version = "1.14.14"
@@ -1530,6 +1547,22 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 },
 ]
 
+[[package]]
+name = "mistralai"
+version = "1.9.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "eval-type-backport" },
+    { name = "httpx" },
+    { name = "pydantic" },
+    { name = "python-dateutil" },
+    { name = "typing-inspection" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/8c/d9/e704bb05f0eb5c6726cbbb0c9c16542553b4cdb6b4ccaf45b8c033105738/mistralai-1.9.1.tar.gz", hash = "sha256:89eb1d48e9555c8289c02ddea966115eba0516355731726ea0a24eabb42f8419", size = 182308 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/10/a2/2e177165a24d978f07cf5d5841265ab399c187b0a44077d67502b8129b27/mistralai-1.9.1-py3-none-any.whl", hash = "sha256:250ec26534db6f4a4d5e6292b0801a64da2ab1f0d4c63a20d8ce27e3a427e402", size = 381773 },
+]
+
 [[package]]
 name = "msal"
 version = "1.32.3"
@@ -2116,6 +2149,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771 },
 ]
 
+[[package]]
+name = "pypdf"
+version = "5.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7b/42/fbc37af367b20fa6c53da81b1780025f6046a0fac8cbf0663a17e743b033/pypdf-5.7.0.tar.gz", hash = "sha256:68c92f2e1aae878bab1150e74447f31ab3848b1c0a6f8becae9f0b1904460b6f", size = 5026120 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/73/9f/78d096ef795a813fa0e1cb9b33fa574b205f2b563d9c1e9366c854cf0364/pypdf-5.7.0-py3-none-any.whl", hash = "sha256:203379453439f5b68b7a1cd43cdf4c5f7a02b84810cefa7f93a47b350aaaba48", size = 305524 },
+]
+
 [[package]]
 name = "pyperclip"
 version = "1.9.0"
@@ -2138,6 +2180,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474 },
 ]
 
+[[package]]
+name = "pytest-asyncio"
+version = "1.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d0/d4/14f53324cb1a6381bef29d698987625d80052bb33932d8e7cbf9b337b17c/pytest_asyncio-1.0.0.tar.gz", hash = "sha256:d15463d13f4456e1ead2594520216b225a16f781e144f8fdf6c5bb4667c48b3f", size = 46960 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/30/05/ce271016e351fddc8399e546f6e23761967ee09c8c568bbfbecb0c150171/pytest_asyncio-1.0.0-py3-none-any.whl", hash = "sha256:4f024da9f1ef945e680dc68610b52550e36590a67fd31bb3b4943979a1f90ef3", size = 15976 },
+]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"