# NOTE(review): the lines below are web-scrape residue (Hugging Face Spaces
# page chrome: status banners, file size, commit hashes, line-number gutter)
# that was pasted into the source and made the module unimportable. Commented
# out verbatim rather than deleted:
#   Spaces:
#   Runtime error
#   Runtime error
#   File size: 3,322 Bytes
#   3adfe4f 92f38fd eb3f029 3adfe4f a40ea82 3adfe4f 92f38fd 3adfe4f a40ea82 3adfe4f eb3f029 92f38fd eb3f029 92f38fd eb3f029 |
#   1 2 3 ... 101 |
from langchain_core.tools import tool
import wikipediaapi
import requests
from bs4 import BeautifulSoup
@tool
def wiki_search(query: str) -> str:
    """
    Search Wikipedia for a given query and return the full page content.
    Args:
        query (str): The search query to find relevant Wikipedia articles.
    """
    # NOTE: the docstring above is also the tool description the agent's LLM
    # sees, so it is kept verbatim.
    # Identify ourselves per Wikimedia's User-Agent policy.
    client = wikipediaapi.Wikipedia(
        user_agent='OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)',
        language='en',
    )
    result = client.page(query)
    if not result.exists():
        return f"No Wikipedia page found for '{query}'. Please try a different search term."
    # Title and canonical URL first, then the full text (summary + all sections).
    return f"Title: {result.title}\n\nURL: {result.fullurl}\n\n{result.text}"
@tool
def wiki_parse_html(page_title: str, section_id: int | None = None) -> str:
    """
    Get Wikipedia page HTML content using the parse API.
    Use only if the standard wiki_search tool returns insufficient text for a section.
    Args:
        page_title (str): The exact title of the Wikipedia page.
        section_id (int, optional): The section ID number to parse (e.g., "1" for first section).
            If None, returns the entire page.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': page_title,
        'format': 'json',
        'prop': 'text'
    }
    # Only request a single section when the caller asked for one.
    if section_id is not None:
        params['section'] = str(section_id)
    headers = {
        'User-Agent': 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)'
    }
    try:
        # FIX: added timeout — the original call had none, so a stalled
        # connection would hang the agent indefinitely.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        # The parse API reports failures (e.g. missing page) in an 'error' key.
        if 'error' in data:
            return f"Error: {data['error']['info']}"
        if 'parse' not in data or 'text' not in data['parse']:
            return f"No content found for page '{page_title}'"
        # Raw rendered HTML lives under parse.text['*'] in the JSON response.
        raw_html = data['parse']['text']['*']
        # Sanitize: drop style/script tags and strip all attributes while
        # keeping the tag structure, to shrink the payload for the LLM.
        try:
            soup = BeautifulSoup(raw_html, 'html.parser')
            # Remove unwanted tags entirely.
            for tag in soup(['style', 'script']):
                tag.decompose()
            # Strip attributes from all remaining tags
            # (e.g. <div class=".." id=".."> -> <div>).
            from bs4.element import Tag as _Tag
            for tag in soup.find_all(True):
                if isinstance(tag, _Tag):
                    tag.attrs.clear()
            return str(soup)
        except Exception:
            # Best-effort sanitization: fall back to the raw HTML on failure.
            return raw_html
    except requests.RequestException as e:
        return f"Error fetching page: {str(e)}"
    except Exception as e:
        return f"Error parsing response: {str(e)}"
if __name__ == "__main__":
    # Smoke test for the parse tool.
    # FIX: a @tool-decorated function is a langchain StructuredTool; calling
    # it directly (wiki_parse_html(query)) raises a TypeError in current
    # langchain-core. Tools must be run through .invoke() with their args.
    query = "Malko Competition"
    result = wiki_parse_html.invoke({"page_title": query})
    print(result)
|