from langchain_core.tools import tool
import wikipediaapi
import requests
from bs4 import BeautifulSoup

@tool
def wiki_search(query: str) -> str:
    """
    Search Wikipedia for a given query and return the full page content.

    Args:
        query (str): The search query to find relevant Wikipedia articles.
    """
    # Initialize the Wikipedia API client with a descriptive user agent
    wiki = wikipediaapi.Wikipedia(
        user_agent='OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)',
        language='en',
    )
    
    # Get the page
    page = wiki.page(query)
    
    # Check if page exists
    if not page.exists():
        return f"No Wikipedia page found for '{query}'. Please try a different search term."
    
    # Return the full text content (summary + all sections)
    return f"Title: {page.title}\n\nURL: {page.fullurl}\n\n{page.text}"

@tool
def wiki_parse_html(page_title: str, section_id: int | None = None) -> str:
    """
    Get Wikipedia page HTML content using the parse API.
    Use only if the standard wiki_search tool returns insufficient text for a section.

    Args:
        page_title (str): The exact title of the Wikipedia page.
        section_id (int, optional): The section ID to parse (e.g., 1 for the first section).
                                    If None, returns the entire page.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': page_title,
        'format': 'json',
        'prop': 'text'
    }
    
    # Add section parameter if provided
    if section_id is not None:
        params['section'] = str(section_id)
    
    headers = {
        'User-Agent': 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)'
    }
    
    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
        
        if 'error' in data:
            return f"Error: {data['error']['info']}"
        
        if 'parse' not in data or 'text' not in data['parse']:
            return f"No content found for page '{page_title}'"
        
        # Raw HTML content from Wikipedia
        raw_html = data['parse']['text']['*']

        # Sanitize HTML: remove style/script tags and strip all attributes while keeping tag structure
        try:
            soup = BeautifulSoup(raw_html, 'html.parser')

            # Remove unwanted tags entirely
            for tag in soup(['style', 'script']):
                tag.decompose()

            # Strip attributes from all remaining tags (e.g., <div class=".." id=".."> -> <div>);
            # find_all(True) yields only Tag objects, so no type check is needed
            for tag in soup.find_all(True):
                tag.attrs.clear()

            # Serialize the sanitized tree back to an HTML string
            return str(soup)
        except Exception:
            # Fall back to the raw HTML if sanitization fails
            return raw_html
        
    except requests.RequestException as e:
        return f"Error fetching page: {str(e)}"
    except Exception as e:
        return f"Error parsing response: {str(e)}"

if __name__ == "__main__":
    # @tool wraps these functions in LangChain tool objects, so they must be
    # called via .invoke() with a dict of arguments, not positionally.
    query = "Malko Competition"
    result = wiki_parse_html.invoke({"page_title": query})
    print(result)
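
    # Illustrative extra step (assumed example, not in the original file): the
    # parse API treats section 0 as the lead section, so this fetches only the
    # lead and strips tags locally to recover plain text.
    lead_html = wiki_parse_html.invoke({"page_title": query, "section_id": 0})
    print(BeautifulSoup(lead_html, "html.parser").get_text()[:500])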