# Source: abtsousa/oraclebot — commit eb3f029
# "Add bs4 dependency and update Wikipedia tool for HTML parsing"
from langchain_core.tools import tool
import wikipediaapi
import requests
from bs4 import BeautifulSoup
@tool
def wiki_search(query: str) -> str:
    """
    Search Wikipedia for a given query and return the full page content.
    Args:
        query (str): The search query to find relevant Wikipedia articles.
    """
    # Identify the bot per Wikimedia's User-Agent policy.
    client = wikipediaapi.Wikipedia(
        user_agent='OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)',
        language='en',
    )
    result_page = client.page(query)

    # Bail out early when the title does not resolve to an article.
    if not result_page.exists():
        return f"No Wikipedia page found for '{query}'. Please try a different search term."

    # Full text = summary plus all sections, prefixed with title and canonical URL.
    return f"Title: {result_page.title}\n\nURL: {result_page.fullurl}\n\n{result_page.text}"
@tool
def wiki_parse_html(page_title: str, section_id: int | None = None) -> str:
    """
    Get Wikipedia page HTML content using the parse API.
    Use only if the standard wiki_search tool returns insufficient text for a section.
    Args:
        page_title (str): The exact title of the Wikipedia page.
        section_id (int, optional): The section ID number to parse (e.g., "1" for first section).
        If None, returns the entire page.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': page_title,
        'format': 'json',
        'prop': 'text'
    }
    # Add section parameter if provided
    if section_id is not None:
        params['section'] = str(section_id)
    headers = {
        'User-Agent': 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)'
    }
    try:
        # Explicit timeout so a stalled request cannot hang the agent indefinitely.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        # The parse API reports errors in-band rather than via HTTP status codes.
        if 'error' in data:
            return f"Error: {data['error']['info']}"
        if 'parse' not in data or 'text' not in data['parse']:
            return f"No content found for page '{page_title}'"
        # Raw HTML content from Wikipedia ('*' is the legacy JSON payload key)
        raw_html = data['parse']['text']['*']
        try:
            return _sanitize_wiki_html(raw_html)
        except Exception:
            # Fallback to raw HTML if sanitization fails
            return raw_html
    except requests.RequestException as e:
        return f"Error fetching page: {str(e)}"
    except Exception as e:
        return f"Error parsing response: {str(e)}"


def _sanitize_wiki_html(raw_html: str) -> str:
    """Remove style/script tags and strip all attributes, keeping tag structure.

    Shrinks the token footprint of the returned HTML (e.g., <div class=".."
    id=".."> -> <div>) while preserving headings, tables, and links structure.
    """
    from bs4.element import Tag as _Tag

    soup = BeautifulSoup(raw_html, 'html.parser')
    # Remove unwanted tags entirely
    for tag in soup(['style', 'script']):
        tag.decompose()
    # Strip attributes from all remaining tags
    for tag in soup.find_all(True):
        if isinstance(tag, _Tag):
            tag.attrs.clear()
    return str(soup)
if __name__ == "__main__":
    # Manual smoke test: fetch and sanitize a full page over the network.
    query = "Malko Competition"
    # @tool turns the function into a LangChain BaseTool, which is not
    # directly callable — it must be run via .invoke() with its input schema.
    result = wiki_parse_html.invoke({"page_title": query})
    print(result)