abtsousa committed · Commit eb3f029 · 1 Parent(s): e8c805a

Add bs4 dependency and update Wikipedia tool for HTML parsing

Files changed (3):
  1. pyproject.toml +1 -0
  2. tools/wikipedia.py +77 -3
  3. uv.lock +14 -0
pyproject.toml CHANGED
@@ -6,6 +6,7 @@ readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
     "arize-phoenix-otel>=0.12.0",
+    "bs4>=0.0.2",
     "gradio[oauth]>=5.42.0",
     "grandalf>=0.8",
     "ipykernel>=6.30.1",
tools/wikipedia.py CHANGED
@@ -1,5 +1,7 @@
 from langchain_core.tools import tool
 import wikipediaapi
+import requests
+from bs4 import BeautifulSoup
 
 @tool
 def wiki_search(query: str) -> str:
@@ -9,14 +11,14 @@ def wiki_search(query: str) -> str:
     Args:
         query (str): The search query to find relevant Wikipedia articles.
     """
-    # Initialize Wikipedia API with a proper user agent
-    wiki_wiki = wikipediaapi.Wikipedia(
+    # Initialize Wikipedia API with additional parameters for more info
+    wiki = wikipediaapi.Wikipedia(
         user_agent='OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)',
         language='en',
     )
 
     # Get the page
-    page = wiki_wiki.page(query)
+    page = wiki.page(query)
 
     # Check if page exists
     if not page.exists():
@@ -24,3 +26,75 @@ def wiki_search(query: str) -> str:
 
     # Return the full text content (summary + all sections)
     return f"Title: {page.title}\n\nURL: {page.fullurl}\n\n{page.text}"
+
+@tool
+def wiki_parse_html(page_title: str, section_id: int | None = None) -> str:
+    """
+    Get Wikipedia page HTML content using the parse API.
+    Use only if the standard wiki_search tool returns insufficient text for a section.
+
+    Args:
+        page_title (str): The exact title of the Wikipedia page.
+        section_id (int, optional): The section ID number to parse (e.g., 1 for the first section).
+            If None, returns the entire page.
+    """
+    url = "https://en.wikipedia.org/w/api.php"
+    params = {
+        'action': 'parse',
+        'page': page_title,
+        'format': 'json',
+        'prop': 'text'
+    }
+
+    # Add section parameter if provided
+    if section_id is not None:
+        params['section'] = str(section_id)
+
+    headers = {
+        'User-Agent': 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)'
+    }
+
+    try:
+        response = requests.get(url, params=params, headers=headers)
+        response.raise_for_status()
+        data = response.json()
+
+        if 'error' in data:
+            return f"Error: {data['error']['info']}"
+
+        if 'parse' not in data or 'text' not in data['parse']:
+            return f"No content found for page '{page_title}'"
+
+        # Raw HTML content from Wikipedia
+        raw_html = data['parse']['text']['*']
+
+        # Sanitize HTML: remove style/script tags and strip all attributes while keeping tag structure
+        try:
+            soup = BeautifulSoup(raw_html, 'html.parser')
+
+            # Remove unwanted tags entirely
+            for tag in soup(['style', 'script']):
+                tag.decompose()
+
+            # Strip attributes from all remaining tags (e.g., <div class=".." id=".."> -> <div>)
+            from bs4.element import Tag as _Tag
+            for tag in soup.find_all(True):
+                if isinstance(tag, _Tag):
+                    tag.attrs.clear()
+
+            # Serialize the cleaned soup back to an HTML string
+            text = str(soup)
+            return text
+        except Exception:
+            # Fall back to raw HTML if sanitization fails
+            return raw_html
+
+    except requests.RequestException as e:
+        return f"Error fetching page: {str(e)}"
+    except Exception as e:
+        return f"Error parsing response: {str(e)}"
+
+if __name__ == "__main__":
+    query = "Malko Competition"
+    result = wiki_parse_html.invoke({"page_title": query})
+    print(result)
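To make the sanitization step in `wiki_parse_html` concrete, here is a minimal, self-contained sketch of the same bs4 clean-up applied to a made-up snippet of MediaWiki-style markup (illustrative, not part of the commit):

```python
from bs4 import BeautifulSoup

# Made-up stand-in for the HTML returned by the MediaWiki parse API
raw_html = '<div class="mw-parser-output"><style>.x{}</style><p id="a"><b>Malko</b> Competition</p></div>'

soup = BeautifulSoup(raw_html, "html.parser")

# Drop style/script blocks entirely, as the tool does
for tag in soup(["style", "script"]):
    tag.decompose()

# Strip all attributes while keeping the tag structure
for tag in soup.find_all(True):
    tag.attrs.clear()

print(soup)  # -> <div><p><b>Malko</b> Competition</p></div>
```

Because both functions are wrapped with `@tool`, they are LangChain tools and are driven through `.invoke()` (e.g. `wiki_parse_html.invoke({"page_title": "Malko Competition"})`) rather than called as plain functions, as in the `__main__` block above.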
uv.lock CHANGED
@@ -235,6 +235,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7e/c1/ec214e9c94000d1c1974ec67ced1c970c148aa6b8d8373066123fc3dbf06/Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b", size = 358517, upload-time = "2024-10-18T12:32:54.066Z" },
 ]
 
+[[package]]
+name = "bs4"
+version = "0.0.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "beautifulsoup4" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c9/aa/4acaf814ff901145da37332e05bb510452ebed97bc9602695059dd46ef39/bs4-0.0.2.tar.gz", hash = "sha256:a48685c58f50fe127722417bae83fe6badf500d54b55f7e39ffe43b798653925", size = 698, upload-time = "2024-01-17T18:15:47.371Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/51/bb/bf7aab772a159614954d84aa832c129624ba6c32faa559dfb200a534e50b/bs4-0.0.2-py2.py3-none-any.whl", hash = "sha256:abf8742c0805ef7f662dce4b51cca104cffe52b835238afc169142ab9b3fbccc", size = 1189, upload-time = "2024-01-17T18:15:48.613Z" },
+]
+
 [[package]]
 name = "cachetools"
 version = "5.5.2"
@@ -1802,6 +1814,7 @@ version = "0.1.0"
 source = { virtual = "." }
 dependencies = [
     { name = "arize-phoenix-otel" },
+    { name = "bs4" },
     { name = "gradio", extra = ["oauth"] },
     { name = "grandalf" },
     { name = "ipykernel" },
@@ -1827,6 +1840,7 @@ dependencies = [
 [package.metadata]
 requires-dist = [
     { name = "arize-phoenix-otel", specifier = ">=0.12.0" },
+    { name = "bs4", specifier = ">=0.0.2" },
     { name = "gradio", extras = ["oauth"], specifier = ">=5.42.0" },
     { name = "grandalf", specifier = ">=0.8" },
     { name = "ipykernel", specifier = ">=6.30.1" },