# Source: abtsousa/oraclebot — commit eb3f029
# "Add bs4 dependency and update Wikipedia tool for HTML parsing"
from langchain_core.tools import tool
import wikipediaapi
import requests
from bs4 import BeautifulSoup
@tool
def wiki_search(query: str) -> str:
    """
    Search Wikipedia for a given query and return the full page content.
    Args:
        query (str): The search query to find relevant Wikipedia articles.
    """
    # Identify the bot per Wikimedia's User-Agent policy.
    client = wikipediaapi.Wikipedia(
        user_agent='OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)',
        language='en',
    )
    result_page = client.page(query)

    # Bail out early when the title does not resolve to an article.
    if not result_page.exists():
        return f"No Wikipedia page found for '{query}'. Please try a different search term."

    # Full text = summary plus all sections, prefixed with title and canonical URL.
    return f"Title: {result_page.title}\n\nURL: {result_page.fullurl}\n\n{result_page.text}"
@tool
def wiki_parse_html(page_title: str, section_id: int | None = None) -> str:
    """
    Get Wikipedia page HTML content using the parse API.
    Use only if the standard wiki_search tool returns insufficient text for a section.
    Args:
        page_title (str): The exact title of the Wikipedia page.
        section_id (int, optional): The section ID number to parse (e.g., "1" for first section).
        If None, returns the entire page.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': page_title,
        'format': 'json',
        'prop': 'text'
    }
    # Add section parameter if provided
    if section_id is not None:
        params['section'] = str(section_id)
    headers = {
        'User-Agent': 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)'
    }
    try:
        # Explicit timeout so a stalled request cannot hang the agent indefinitely.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        # The parse API reports errors in-band rather than via HTTP status codes.
        if 'error' in data:
            return f"Error: {data['error']['info']}"
        if 'parse' not in data or 'text' not in data['parse']:
            return f"No content found for page '{page_title}'"
        # Raw HTML content from Wikipedia ('*' is the legacy JSON payload key)
        raw_html = data['parse']['text']['*']
        try:
            return _sanitize_wiki_html(raw_html)
        except Exception:
            # Fallback to raw HTML if sanitization fails
            return raw_html
    except requests.RequestException as e:
        return f"Error fetching page: {str(e)}"
    except Exception as e:
        return f"Error parsing response: {str(e)}"


def _sanitize_wiki_html(raw_html: str) -> str:
    """Remove style/script tags and strip all attributes, keeping tag structure.

    Shrinks the token footprint of the returned HTML (e.g., <div class=".."
    id=".."> -> <div>) while preserving headings, tables, and links structure.
    """
    from bs4.element import Tag as _Tag

    soup = BeautifulSoup(raw_html, 'html.parser')
    # Remove unwanted tags entirely
    for tag in soup(['style', 'script']):
        tag.decompose()
    # Strip attributes from all remaining tags
    for tag in soup.find_all(True):
        if isinstance(tag, _Tag):
            tag.attrs.clear()
    return str(soup)
if __name__ == "__main__":
    # Manual smoke test: fetch and sanitize a full page over the network.
    query = "Malko Competition"
    # @tool turns the function into a LangChain BaseTool, which is not
    # directly callable — it must be run via .invoke() with its input schema.
    result = wiki_parse_html.invoke({"page_title": query})
    print(result)