from langchain_core.tools import tool
import wikipediaapi
import requests
from bs4 import BeautifulSoup


def wiki_search(query: str) -> str:
    """
    Search Wikipedia for a given query and return the full page content.

    Args:
        query (str): The search query to find relevant Wikipedia articles.
    """
    # Initialize the Wikipedia API client with a descriptive user agent
    wiki = wikipediaapi.Wikipedia(
        user_agent='OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)',
        language='en',
    )

    # Get the page
    page = wiki.page(query)

    # Check if the page exists
    if not page.exists():
        return f"No Wikipedia page found for '{query}'. Please try a different search term."

    # Return the full text content (summary + all sections)
    return f"Title: {page.title}\n\nURL: {page.fullurl}\n\n{page.text}"
def wiki_parse_html(page_title: str, section_id: int | None = None) -> str:
    """
    Get Wikipedia page HTML content using the MediaWiki parse API.

    Use only if the standard wiki_search tool returns insufficient text for a section.

    Args:
        page_title (str): The exact title of the Wikipedia page.
        section_id (int, optional): The numeric section ID to parse
            (e.g., 1 for the first section after the lead).
            If None, returns the entire page.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': page_title,
        'format': 'json',
        'prop': 'text'
    }

    # Add the section parameter only if a specific section was requested
    if section_id is not None:
        params['section'] = str(section_id)

    headers = {
        'User-Agent': 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)'
    }
    try:
        # Time out rather than hang the agent on a slow or unresponsive request
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()

        data = response.json()

        if 'error' in data:
            return f"Error: {data['error']['info']}"

        if 'parse' not in data or 'text' not in data['parse']:
            return f"No content found for page '{page_title}'"

        # Raw HTML content from Wikipedia
        raw_html = data['parse']['text']['*']
        # Sanitize the HTML: drop style/script tags and strip all attributes
        # while keeping the tag structure
        try:
            soup = BeautifulSoup(raw_html, 'html.parser')

            # Remove unwanted tags entirely
            for tag in soup(['style', 'script']):
                tag.decompose()

            # Strip attributes from all remaining tags
            # (e.g., <div class=".." id=".."> -> <div>)
            from bs4.element import Tag
            for tag in soup.find_all(True):
                if isinstance(tag, Tag):
                    tag.attrs.clear()

            # Return the cleaned markup as a string
            return str(soup)
        except Exception:
            # Fall back to the raw HTML if sanitization fails
            return raw_html
    except requests.RequestException as e:
        return f"Error fetching page: {str(e)}"
    except Exception as e:
        return f"Error parsing response: {str(e)}"


if __name__ == "__main__":
    query = "Malko Competition"
    result = wiki_parse_html(query)
    print(result)
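
    # Further usage sketches (illustrative values, not from the original file):
    # fetch a single section by its numeric ID, and compare with the plain-text tool.
    first_section = wiki_parse_html(query, section_id=1)
    print(first_section[:500])

    plain_text = wiki_search(query)
    print(plain_text[:500])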