Charles Azam committed
Commit f1368c4 · Parent: b1060b0

feat: add crawling functions

.vscode/settings.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "python.testing.pytestArgs": [
+     "tests"
+   ],
+   "python.testing.unittestEnabled": false,
+   "python.testing.pytestEnabled": true
+ }
data/{linkup_response.json β†’ answers/linkup_response.json} RENAMED
File without changes
data/{tavily_response.json β†’ answers/tavily_response.json} RENAMED
File without changes
pyproject.toml CHANGED
@@ -21,6 +21,10 @@ dependencies = [
      "gradio",
      "open-deep-research",
      "python-dotenv>=1.1.1",
+     "httpx",
+     "pypdf",
+     "pytest-asyncio>=1.0.0",
+     "mistralai>=1.9.1",
  ]

  [project.scripts]
src/deepengineer/common_path.py ADDED
@@ -0,0 +1,11 @@
+ from pathlib import Path
+
+ DEEPENGINEER_CODE_DIR = Path(__file__).parent
+ DEEPENGINEER_SRC_DIR = DEEPENGINEER_CODE_DIR.parent
+ DEEPENGINEER_ROOT_DIR = DEEPENGINEER_SRC_DIR.parent
+
+ assert DEEPENGINEER_CODE_DIR.name == "deepengineer"
+ assert DEEPENGINEER_SRC_DIR.name == "src"
+
+ DATA_DIR = DEEPENGINEER_ROOT_DIR / "data"
+ assert DATA_DIR.exists()
src/deepengineer/webcrawler/async_crawl.py CHANGED
@@ -1,15 +1,33 @@
- async def tavily_extract_async():
-     pass
-
- async def tavily_crawl_async():
-     pass
-
- async def crawl4ai_extract_async():
-     pass
-
- async def crawl4ai_crawl_async():
-     pass
-
-
+ import aiofiles
+ import httpx
+ import crawl4ai
+ import os
+ from pathlib import Path

+ async def crawl4ai_extract_markdown_of_url_async(url: str) -> str:
+     """Extract markdown content from a URL using crawl4ai."""
+     async with crawl4ai.AsyncWebCrawler() as crawler:
+         result = await crawler.arun(url=url)
+         return result.markdown
+
+ async def download_pdf_async(url: str, output_path: Path) -> Path:
+     """Download a PDF file from a URL."""
+     timeout = httpx.Timeout(30.0, connect=10.0)
+     async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
+         response = await client.get(url)
+         response.raise_for_status()
+         async with aiofiles.open(output_path, "wb") as f:
+             await f.write(response.content)
+         return output_path

+ async def arxiv_download_pdf_async(url: str, output_path: Path) -> Path:
+     """Download a PDF from arXiv by converting the abstract URL to a PDF URL."""
+     # Extract the arXiv ID from the URL
+     if "/abs/" in url:
+         arxiv_id = url.split("/abs/")[1].rstrip("/")
+         pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+     else:
+         # If it's already a PDF URL, use it as is
+         pdf_url = url
+
+     return await download_pdf_async(pdf_url, output_path)
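
A minimal usage sketch (not part of the commit) showing how the new crawl helpers might be called; the URLs are the fixtures defined later in testing.py:

import asyncio
from pathlib import Path

from deepengineer.webcrawler.async_crawl import (
    crawl4ai_extract_markdown_of_url_async,
    arxiv_download_pdf_async,
)

async def main() -> None:
    # Extract a page as markdown via crawl4ai.
    markdown = await crawl4ai_extract_markdown_of_url_async(
        "https://en.wikipedia.org/wiki/Graphite-moderated_reactor"
    )
    print(markdown[:200])

    # Resolve an arXiv abstract URL to its PDF and download it.
    pdf_path = await arxiv_download_pdf_async(
        "https://arxiv.org/abs/1301.1699", output_path=Path("paper.pdf")
    )
    print(f"Saved PDF to {pdf_path}")

if __name__ == "__main__":
    asyncio.run(main())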
src/deepengineer/webcrawler/async_search.py CHANGED
@@ -3,6 +3,7 @@ import asyncio
  import requests
  from pydantic import BaseModel, Field
  from typing import List, Optional, Literal
+ from enum import Enum

  from linkup import LinkupClient, LinkupSourcedAnswer
  from tavily import AsyncTavilyClient
@@ -23,6 +24,11 @@ class SearchResponse(BaseModel):
      answer: str | None = Field(None, description="Direct answer from the search API if available")
      search_results: list[SearchResult] = Field(default_factory=list, description="List of search results")

+ class ScientificDomains(str, Enum):
+     wikipedia = "wikipedia.org"
+     arxiv = "arxiv.org"
+     pubmed = "pubmed.ncbi.nlm.nih.gov"
+     sciencedirect = "sciencedirect.com"

  def get_tavily_usage():
      url = "https://api.tavily.com/usage"
@@ -39,7 +45,8 @@ async def tavily_search_async(
      include_answer: Literal["basic", "advanced"] | None = "advanced",
      include_raw_content: Literal["text", "markdown"] | None = "markdown",
      include_images: bool = False,
-     search_depth: Literal['basic', 'advanced'] | None = "basic"
+     search_depth: Literal['basic', 'advanced'] | None = "basic",
+     include_domains: list[ScientificDomains] = None,
  ) -> SearchResponse:
      """
      Performs concurrent web searches with the Tavily API
@@ -52,7 +59,8 @@ async def tavily_search_async(
          include_answer=include_answer,
          include_raw_content=include_raw_content,
          max_results=max_results,
-         include_images=include_images
+         include_images=include_images,
+         include_domains=include_domains,
      )

      search_results = [
@@ -90,6 +98,7 @@ async def async_linkup_search(
      depth: Literal["standard", "deep"] = "standard",
      output_type: Literal['searchResults', 'sourcedAnswer', 'structured'] = "sourcedAnswer",
      include_images: bool = False,
+     include_domains: list[ScientificDomains] = None,
  ) -> SearchResponse:
      """
      Performs concurrent web searches using the Linkup API.
@@ -100,11 +109,10 @@ async def async_linkup_search(
          query=search_query,
          depth=depth,
          output_type=output_type,
-         include_images=include_images
+         include_images=include_images,
+         include_domains=include_domains,
      )

-
-
      search_results = [
          SearchResult(
              title=result.name,
@@ -126,28 +134,28 @@ async def async_linkup_search(



- class ArxivSearchParams(BaseModel):
-     """Parameters for arXiv search."""
-     load_max_docs: int = Field(default=5, ge=1, le=20, description="Maximum number of documents to return per query")
-     get_full_documents: bool = Field(default=True, description="Whether to fetch full text of documents")
-     load_all_available_meta: bool = Field(default=True, description="Whether to load all available metadata")
-
-
- class PubMedSearchParams(BaseModel):
-     """Parameters for PubMed search."""
-     top_k_results: int = Field(default=5, ge=1, le=20, description="Maximum number of documents to return per query")
-     email: Optional[str] = Field(None, description="Email address for PubMed API. Required by NCBI.")
-     api_key: Optional[str] = Field(None, description="API key for PubMed API for higher rate limits")
-     doc_content_chars_max: int = Field(default=4000, ge=100, le=10000, description="Maximum characters for document content")
-

  async def arxiv_search_async(
      search_query: str,
  ) -> SearchResponse:
-     raise NotImplementedError("Arxiv search is not implemented yet")
+     response = await async_linkup_search(search_query, include_domains=[ScientificDomains.arxiv])
+     return response


  async def pubmed_search_async(
-     query: str,
+     search_query: str,
  ) -> SearchResponse:
-     raise NotImplementedError("PubMed search is not implemented yet")
+     response = await async_linkup_search(search_query, include_domains=[ScientificDomains.pubmed])
+     return response
+
+ async def sciencedirect_search_async(
+     search_query: str,
+ ) -> SearchResponse:
+     response = await async_linkup_search(search_query, include_domains=[ScientificDomains.sciencedirect])
+     return response
+
+ async def scientific_search_async(
+     search_query: str,
+ ) -> SearchResponse:
+     response = await async_linkup_search(search_query, include_domains=[ScientificDomains.wikipedia, ScientificDomains.arxiv, ScientificDomains.pubmed, ScientificDomains.sciencedirect])
+     return response
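
A hedged sketch (not in the commit) of how the new include_domains parameter and the domain-restricted wrappers might be used; it assumes the Linkup API key is configured as LinkupClient requires:

import asyncio

from deepengineer.webcrawler.async_search import (
    ScientificDomains,
    async_linkup_search,
    scientific_search_async,
)

async def main() -> None:
    # Restrict a Linkup search to arxiv.org only.
    response = await async_linkup_search(
        "graphite moderated lead cooled reactor",
        include_domains=[ScientificDomains.arxiv],
    )
    print(response.answer)

    # Or search all configured scientific domains at once.
    response = await scientific_search_async("thermal neutron scattering in graphite")
    for result in response.search_results:
        print(result.title)

asyncio.run(main())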
src/deepengineer/webcrawler/pdf_tools.py ADDED
@@ -0,0 +1,130 @@
+ import os
+ from pathlib import Path
+ from pypdf import PdfReader, PdfWriter
+ import io
+ from pathlib import Path
+ from mistralai import Mistral
+ import os
+ from litellm import completion
+
+ from mistralai.models import OCRResponse
+ import yaml
+ from tenacity import retry, stop_after_attempt, wait_fixed, RetryError
+ from litellm.exceptions import BadRequestError
+
+ # Define the size limit in bytes
+ MAX_SIZE_BYTES = 49 * 1024 * 1024
+
+
+ async def convert_pdf_to_markdown_async(
+     pdf_path: Path,
+     with_image_description: bool = False,
+ ) -> tuple[str, OCRResponse]:
+
+     mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
+
+     uploaded_pdf = await mistral_client.files.upload_async(
+         file={
+             "file_name": "uploaded_file.pdf",
+             "content": open(pdf_path, "rb"),
+         },
+         purpose="ocr",
+     )
+
+     signed_url = await mistral_client.files.get_signed_url_async(file_id=uploaded_pdf.id)
+
+     ocr_response = await mistral_client.ocr.process_async(
+         model="mistral-ocr-latest",
+         document={"type": "document_url", "document_url": signed_url.url},
+         include_image_base64=True,
+     )
+     print(f"Processing PDF: {pdf_path.name}")
+     return (
+         _get_combined_markdown(
+             ocr_response=ocr_response, with_image_description=with_image_description
+         ),
+         ocr_response,
+     )
+
+
+ def _get_image_description_using_llm(
+     base_64_str: str, model: str = "mistral/mistral-small-latest"
+ ) -> str | None:
+     assert base_64_str.startswith("data:image/jpeg;base64")
+
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": "Describe this image in detail:"},
+                 {"type": "image_url", "image_url": {"url": base_64_str}},
+             ],
+         }
+     ]
+     try:
+         response = completion(
+             model=model,  # LiteLLM naming convention
+             messages=messages,
+             temperature=0.0,
+             stream=False,
+         )
+         output = dict(response)["choices"][0].message.content
+     except BadRequestError:
+         output = ""
+     return output
+
+
+ def _replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
+     """
+     Replace image placeholders in markdown with base64-encoded images.
+
+     Args:
+         markdown_str: Markdown text containing image placeholders
+         images_dict: Dictionary mapping image IDs to base64 strings
+
+     Returns:
+         Markdown text with images replaced by base64 data
+     """
+     for img_name, base64_str in images_dict.items():
+         print(f"Processing image: {img_name}")
+         try:
+             image_description = _get_image_description_using_llm(base_64_str=base64_str)
+         except RetryError:
+             image_description = "Image not found"
+         formatted_description = f"""> [Image {img_name} Replaced with Description Below]
+ > {image_description.replace('\n', '\n> ')}
+ """
+         markdown_str = markdown_str.replace(
+             f"![{img_name}]({img_name})", formatted_description
+         )
+     return markdown_str
+
+
+ def _get_combined_markdown(
+     ocr_response: OCRResponse, with_image_description: bool
+ ) -> str:
+     """
+     Combine OCR text and images into a single markdown document.
+
+     Args:
+         ocr_response: Response from OCR processing containing text and images
+
+     Returns:
+         Combined markdown string with embedded images
+     """
+
+     markdowns: list[str] = []
+     # Extract images from page
+     for page in ocr_response.pages:
+         # Replace image placeholders with actual images
+         if with_image_description:
+             image_data = {}
+             for img in page.images:
+                 image_data[img.id] = img.image_base64
+             page_description = _replace_images_in_markdown(page.markdown, image_data)
+         else:
+             page_description = page.markdown
+         markdowns.append(page_description)
+
+     return "\n\n".join(markdowns)
+
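
A short usage sketch (not part of the commit) for the new OCR helper; it assumes MISTRAL_API_KEY is set and that a local paper.pdf exists (for example, one produced by download_pdf_async):

import asyncio
from pathlib import Path

from deepengineer.webcrawler.pdf_tools import convert_pdf_to_markdown_async

async def main() -> None:
    # Upload the PDF to Mistral, run mistral-ocr-latest, and join the per-page markdown.
    markdown, ocr_response = await convert_pdf_to_markdown_async(
        Path("paper.pdf"),
        with_image_description=False,  # True replaces embedded images with LLM descriptions
    )
    print(f"Converted {len(ocr_response.pages)} pages")
    print(markdown[:500])

asyncio.run(main())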
src/deepengineer/webcrawler/testing.py ADDED
@@ -0,0 +1,20 @@
+ from deepengineer.common_path import DATA_DIR
+ from deepengineer.webcrawler.async_search import SearchResponse
+
+ LINKUP_RESPONSE_FILE = DATA_DIR / "answers" / "linkup_response.json"
+ TAVILY_RESPONSE_FILE = DATA_DIR / "answers" / "tavily_response.json"
+
+
+ def load_linkup_response() -> SearchResponse:
+     with open(LINKUP_RESPONSE_FILE, "r") as f:
+         return SearchResponse.model_validate_json(f.read())
+
+ def load_tavily_response() -> SearchResponse:
+     with open(TAVILY_RESPONSE_FILE, "r") as f:
+         return SearchResponse.model_validate_json(f.read())
+
+ URL_WIKIPEDIA = "https://en.wikipedia.org/wiki/Graphite-moderated_reactor"
+ URL_PDF = "https://arxiv.org/pdf/1301.1699.pdf"
+ ARXIV_URL = "https://arxiv.org/abs/1301.1699"
+ PUBMED_URL = "https://pubmed.ncbi.nlm.nih.gov/34100000/"
+ SCIENCEDIRECT_URL = "https://www.sciencedirect.com/science/article/abs/pii/0168900289901964"
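
A hypothetical test (not in the commit) showing how the cached responses under data/answers/ can be exercised without spending Tavily or Linkup credits:

from deepengineer.webcrawler.async_search import SearchResponse
from deepengineer.webcrawler.testing import load_linkup_response, load_tavily_response

def test_cached_responses_parse():
    # Both helpers re-validate the stored JSON against the SearchResponse model.
    for response in (load_linkup_response(), load_tavily_response()):
        assert isinstance(response, SearchResponse)
        assert response.answer or response.search_results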
tests/webcrawler/test_async_crawl.py ADDED
@@ -0,0 +1,44 @@
+ import pytest
+ from deepengineer.webcrawler.async_crawl import (
+     crawl4ai_extract_markdown_of_url_async,
+     download_pdf_async,
+     arxiv_download_pdf_async,
+ )
+ from mistralai import OCRResponse
+ from deepengineer.webcrawler.pdf_tools import convert_pdf_to_markdown_async
+ from deepengineer.webcrawler.testing import URL_WIKIPEDIA, URL_PDF, ARXIV_URL
+ from deepengineer.common_path import DATA_DIR
+
+ @pytest.mark.asyncio
+ async def test_crawl4ai_extract_markdown_of_url_async():
+     markdown = await crawl4ai_extract_markdown_of_url_async(URL_WIKIPEDIA)
+     assert isinstance(markdown, str)
+     assert "Graphite-moderated reactor" in markdown
+
+ @pytest.mark.asyncio
+ async def test_download_pdf_async():
+     output_path = DATA_DIR / "temp.pdf"
+     output_path.unlink(missing_ok=True)
+     pdf_path = await download_pdf_async(URL_PDF, output_path=output_path)
+     assert pdf_path == output_path
+     assert output_path.exists()
+
+ @pytest.mark.asyncio
+ async def test_arxiv_download_pdf_async():
+     output_path = DATA_DIR / "temp.pdf"
+     output_path.unlink(missing_ok=True)
+     assert not output_path.exists()
+     pdf_path = await arxiv_download_pdf_async(ARXIV_URL, output_path=output_path)
+     assert pdf_path == output_path
+     assert output_path.exists()
+
+ @pytest.mark.expensive
+ @pytest.mark.asyncio
+ async def test_convert_pdf_to_markdown_async():
+     pdf_path = DATA_DIR / "report_thermal_neutron.pdf"
+     assert pdf_path.exists()
+     markdown, ocr_response = await convert_pdf_to_markdown_async(pdf_path)
+     assert isinstance(ocr_response, OCRResponse)
+     assert len(ocr_response.pages) == 16
+     assert isinstance(markdown, str)
+     assert "where each cylinder represent" in markdown
tests/webcrawler/{test_utils.py β†’ test_async_search.py} RENAMED
@@ -10,17 +10,17 @@ from deepengineer.webcrawler.async_search import (


  @pytest.mark.expensive
- def test_tavily_search_async():
+ @pytest.mark.asyncio
+ async def test_tavily_search_async():

      usage_before = get_tavily_usage()
      print(usage_before)


-     response = asyncio.run(
-         tavily_search_async(
-             search_query="Would it be possible to make a thermal reactor with graphite and lead?",
-         )
+     response = await tavily_search_async(
+         search_query="Would it be possible to make a thermal reactor with graphite and lead?",
      )
+
      print(response.answer)
      assert response is not None
      assert isinstance(response, SearchResponse)
@@ -41,15 +41,14 @@ def test_tavily_search_async():
      assert usage_after == usage_before + 1

  @pytest.mark.expensive
- def test_linkup_search_async():
+ @pytest.mark.asyncio
+ async def test_linkup_search_async():

      balance_before = get_linkup_balance()
      print(balance_before)

-     response = asyncio.run(
-         async_linkup_search(
-             search_query="Would it be possible to make a thermal reactor with graphite and lead?",
-         )
+     response = await async_linkup_search(
+         search_query="Would it be possible to make a thermal reactor with graphite and lead?",
      )
      print(response.answer)
      assert response is not None
uv.lock CHANGED
@@ -514,11 +514,15 @@ dependencies = [
      { name = "datasets" },
      { name = "fasttext-wheel" },
      { name = "gradio" },
+     { name = "httpx" },
      { name = "langchain" },
      { name = "litellm" },
+     { name = "mistralai" },
      { name = "open-deep-research" },
      { name = "openai" },
      { name = "pillow" },
+     { name = "pypdf" },
+     { name = "pytest-asyncio" },
      { name = "python-dotenv" },
      { name = "smolagents" },
      { name = "transformers" },
@@ -531,11 +535,15 @@ requires-dist = [
      { name = "datasets" },
      { name = "fasttext-wheel" },
      { name = "gradio" },
+     { name = "httpx" },
      { name = "langchain" },
      { name = "litellm" },
+     { name = "mistralai", specifier = ">=1.9.1" },
      { name = "open-deep-research", git = "https://github.com/langchain-ai/open_deep_research" },
      { name = "openai" },
      { name = "pillow" },
+     { name = "pypdf" },
+     { name = "pytest-asyncio", specifier = ">=1.0.0" },
      { name = "python-dotenv", specifier = ">=1.1.1" },
      { name = "smolagents", specifier = ">=1.19.0" },
      { name = "transformers" },
@@ -574,6 +582,15 @@ wheels = [
      { url = "https://files.pythonhosted.org/packages/6a/28/bd84c58623bc7a817b30ae8ba23356f8cfca8577969c15c7133ac29db1e4/duckduckgo_search-8.0.5-py3-none-any.whl", hash = "sha256:c9f18cb8f8311b9005c8f8f699216dd124b01a797d97a2df725942c1af4c16fa", size = 18228 },
  ]

+ [[package]]
+ name = "eval-type-backport"
+ version = "0.2.2"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/30/ea/8b0ac4469d4c347c6a385ff09dc3c048c2d021696664e26c7ee6791631b5/eval_type_backport-0.2.2.tar.gz", hash = "sha256:f0576b4cf01ebb5bd358d02314d31846af5e07678387486e2c798af0e7d849c1", size = 9079 }
+ wheels = [
+     { url = "https://files.pythonhosted.org/packages/ce/31/55cd413eaccd39125368be33c46de24a1f639f2e12349b0361b4678f3915/eval_type_backport-0.2.2-py3-none-any.whl", hash = "sha256:cb6ad7c393517f476f96d456d0412ea80f0a8cf96f6892834cd9340149111b0a", size = 5830 },
+ ]
+
  [[package]]
  name = "exa-py"
  version = "1.14.14"
@@ -1530,6 +1547,22 @@ wheels = [
      { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 },
  ]

+ [[package]]
+ name = "mistralai"
+ version = "1.9.1"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+     { name = "eval-type-backport" },
+     { name = "httpx" },
+     { name = "pydantic" },
+     { name = "python-dateutil" },
+     { name = "typing-inspection" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/8c/d9/e704bb05f0eb5c6726cbbb0c9c16542553b4cdb6b4ccaf45b8c033105738/mistralai-1.9.1.tar.gz", hash = "sha256:89eb1d48e9555c8289c02ddea966115eba0516355731726ea0a24eabb42f8419", size = 182308 }
+ wheels = [
+     { url = "https://files.pythonhosted.org/packages/10/a2/2e177165a24d978f07cf5d5841265ab399c187b0a44077d67502b8129b27/mistralai-1.9.1-py3-none-any.whl", hash = "sha256:250ec26534db6f4a4d5e6292b0801a64da2ab1f0d4c63a20d8ce27e3a427e402", size = 381773 },
+ ]
+
  [[package]]
  name = "msal"
  version = "1.32.3"
@@ -2116,6 +2149,15 @@ wheels = [
      { url = "https://files.pythonhosted.org/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771 },
  ]

+ [[package]]
+ name = "pypdf"
+ version = "5.7.0"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/7b/42/fbc37af367b20fa6c53da81b1780025f6046a0fac8cbf0663a17e743b033/pypdf-5.7.0.tar.gz", hash = "sha256:68c92f2e1aae878bab1150e74447f31ab3848b1c0a6f8becae9f0b1904460b6f", size = 5026120 }
+ wheels = [
+     { url = "https://files.pythonhosted.org/packages/73/9f/78d096ef795a813fa0e1cb9b33fa574b205f2b563d9c1e9366c854cf0364/pypdf-5.7.0-py3-none-any.whl", hash = "sha256:203379453439f5b68b7a1cd43cdf4c5f7a02b84810cefa7f93a47b350aaaba48", size = 305524 },
+ ]
+
  [[package]]
  name = "pyperclip"
  version = "1.9.0"
@@ -2138,6 +2180,18 @@ wheels = [
      { url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474 },
  ]

+ [[package]]
+ name = "pytest-asyncio"
+ version = "1.0.0"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+     { name = "pytest" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/d0/d4/14f53324cb1a6381bef29d698987625d80052bb33932d8e7cbf9b337b17c/pytest_asyncio-1.0.0.tar.gz", hash = "sha256:d15463d13f4456e1ead2594520216b225a16f781e144f8fdf6c5bb4667c48b3f", size = 46960 }
+ wheels = [
+     { url = "https://files.pythonhosted.org/packages/30/05/ce271016e351fddc8399e546f6e23761967ee09c8c568bbfbecb0c150171/pytest_asyncio-1.0.0-py3-none-any.whl", hash = "sha256:4f024da9f1ef945e680dc68610b52550e36590a67fd31bb3b4943979a1f90ef3", size = 15976 },
+ ]
+
  [[package]]
  name = "python-dateutil"
  version = "2.9.0.post0"