# Source: Hugging Face Space by axel-darmouni — commit f584ef2 ("all modifs"), 5.84 kB
import os
from tools.webpage_tools import (
visit_webpage,
get_all_links,
read_file_from_url,
save_dataset_for_followup,
)
from tools.exploration_tools import (
get_dataset_description,
)
from tools.drawing_tools import (
plot_departments_data,
)
from tools.libreoffice_tools import (
convert_to_pdf_with_libreoffice,
check_libreoffice_availability,
get_libreoffice_info,
)
from tools.retrieval_tools import (
search_datasets,
get_dataset_info,
get_random_quality_dataset,
)
from smolagents import (
CodeAgent,
DuckDuckGoSearchTool,
LiteLLMModel,
)
def create_web_agent(step_callback):
    """Build the CodeAgent used to search, analyze and report on French public datasets.

    Args:
        step_callback: Callable invoked by smolagents after each agent step
            (wired through the built-in ``step_callbacks`` mechanism).

    Returns:
        A configured ``smolagents.CodeAgent`` backed by a Gemini model via LiteLLM.
    """
    llm = LiteLLMModel(
        model_id="gemini/gemini-2.5-flash-preview-05-20",
        api_key=os.getenv("GEMINI_API_KEY"),
    )
    # Full toolbox: web search, page/dataset access, plotting, LibreOffice
    # conversion helpers, and data.gouv.fr retrieval tools.
    toolbox = [
        DuckDuckGoSearchTool(),
        visit_webpage,
        get_all_links,
        read_file_from_url,
        save_dataset_for_followup,
        get_dataset_description,
        plot_departments_data,
        convert_to_pdf_with_libreoffice,
        check_libreoffice_availability,
        get_libreoffice_info,
        search_datasets,
        get_dataset_info,
        get_random_quality_dataset,
    ]
    return CodeAgent(
        tools=toolbox,
        model=llm,
        max_steps=30,
        verbosity_level=1,  # Reduced verbosity for cleaner output
        planning_interval=3,
        step_callbacks=[step_callback],  # Use the built-in callback system
        additional_authorized_imports=[
            "subprocess", "docx", "docx.*",
            "os", "bs4", "io", "requests", "json", "pandas",
            "matplotlib", "matplotlib.pyplot", "matplotlib.*", "numpy", "seaborn",
        ],
    )
def generate_prompt(user_query=None, initial_search_results=None):
    """Generate a unified prompt for dataset search and analysis.

    Args:
        user_query: Optional topic string the analysis should focus on.
        initial_search_results: Optional pre-fetched search results to present
            to the agent (only used when ``user_query`` is also provided).

    Returns:
        The full prompt string for one of three modes: query + initial results,
        query only, or fully generic exploration.
    """
    # Step-by-step workflow shared verbatim by every prompt variant.
    base_instructions = """Follow these steps to analyze French public data:
1. **Dataset Selection**:
- You can use the search_datasets tool to find relevant datasets
- You can use get_dataset_info to get detailed information about specific datasets
- You can use get_random_quality_dataset to explore interesting datasets
2. **Dataset Analysis**:
- Examine the selected dataset page using visit_webpage
- Get all available data links using get_all_links
- Download and analyze the dataset using read_file_from_url
- Save the dataset for follow-up analysis using save_dataset_for_followup
- Get dataset description using get_dataset_description
3. **Visualization Creation**:
- If geographic data (departments/regions) is available, create a map of France
- Create 3 additional non-map visualizations
- Save all visualizations as PNG files
4. **Report Generation**:
- Write insightful analysis text for each visualization
- Generate a comprehensive PDF report using python-docx library that includes:
* Title page with dataset name and analysis overview
* All visualizations (PNG files) embedded in the report
* Analysis text for each visualization
* Conclusions and next steps
- Convert the docx file to PDF using convert_to_pdf_with_libreoffice tool
**Important Technical Notes:**
- Save everything in the generated_data folder
- Do NOT use the 'os' module
- Work step by step, don't generate too much code at once
- Before PDF conversion, call check_libreoffice_availability() - it returns True/False
- If check_libreoffice_availability() returns True, use convert_to_pdf_with_libreoffice() tool
- If check_libreoffice_availability() returns False, skip PDF conversion and inform user
- Do NOT use subprocess calls directly for LibreOffice
- If question is in English, report is in English. If in French, report is in French.
"""
    # Closing guidance appended identically to all three prompt variants.
    shared_footer = """If user query is not specific, remain generic with respect to the dataset at hand.
Focus on getting results and analytics; do not go with too much data, we can always improve it later.
"""
    # Guard clause: no query at all -> fully generic exploration prompt.
    if not user_query:
        return f"""I need you to find and analyze an interesting French public dataset.
{base_instructions}
Start by using the search_datasets tool to find interesting datasets, or use get_random_quality_dataset to explore a high-quality dataset.
{shared_footer}"""
    # Query plus pre-fetched results -> let the agent pick from them or search again.
    if initial_search_results:
        return f"""I need you to analyze French public datasets related to: "{user_query}"
**INITIAL SEARCH RESULTS:**
{initial_search_results}
You have these options:
1. **Use one of the datasets from the initial search results above** - select the most relevant one
2. **Search for different datasets** using the search_datasets tool if none of the above seem perfect
3. **Get more information** about any dataset using get_dataset_info tool
{base_instructions}
Focus your analysis on insights related to "{user_query}". Choose the most relevant dataset and create meaningful visualizations that answer questions about "{user_query}".
{shared_footer}"""
    # Query only -> the agent must run the search itself.
    return f"""I need you to find and analyze French public datasets related to: "{user_query}"
{base_instructions}
Start by using the search_datasets tool to find relevant datasets related to "{user_query}". Focus your analysis on insights related to "{user_query}".
{shared_footer}"""