Spaces:
Configuration error
Configuration error
from dotenv import load_dotenv | |
load_dotenv() | |
import os | |
from loguru import logger | |
from camel.models import ModelFactory | |
from camel.toolkits import ( | |
AudioAnalysisToolkit, | |
CodeExecutionToolkit, | |
DocumentProcessingToolkit, | |
ExcelToolkit, | |
ImageAnalysisToolkit, | |
SearchToolkit, | |
VideoAnalysisToolkit, | |
WebToolkit, | |
) | |
from camel.types import ModelPlatformType, ModelType | |
from camel.configs import ChatGPTConfig | |
from utils import GAIABenchmark | |
# Configuration | |
LEVEL = 1 | |
SAVE_RESULT = True | |
test_idx = [0] | |
def main(): | |
"""Main function to run the GAIA benchmark.""" | |
# Create cache directory | |
cache_dir = "tmp/" | |
os.makedirs(cache_dir, exist_ok=True) | |
# Create models for different components | |
models = { | |
"user": ModelFactory.create( | |
model_platform=ModelPlatformType.OPENAI, | |
model_type=ModelType.GPT_4O, | |
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), | |
), | |
"assistant": ModelFactory.create( | |
model_platform=ModelPlatformType.OPENAI, | |
model_type=ModelType.GPT_4O, | |
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), | |
), | |
"web": ModelFactory.create( | |
model_platform=ModelPlatformType.OPENAI, | |
model_type=ModelType.GPT_4O, | |
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), | |
), | |
"planning": ModelFactory.create( | |
model_platform=ModelPlatformType.OPENAI, | |
model_type=ModelType.GPT_4O, | |
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), | |
), | |
"video": ModelFactory.create( | |
model_platform=ModelPlatformType.OPENAI, | |
model_type=ModelType.GPT_4O, | |
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), | |
), | |
"image": ModelFactory.create( | |
model_platform=ModelPlatformType.OPENAI, | |
model_type=ModelType.GPT_4O, | |
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), | |
), | |
"search": ModelFactory.create( | |
model_platform=ModelPlatformType.OPENAI, | |
model_type=ModelType.GPT_4O, | |
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), | |
), | |
} | |
# Configure toolkits | |
tools = [ | |
*WebToolkit( | |
headless=False, # Set to True for headless mode (e.g., on remote servers) | |
web_agent_model=models["web"], | |
planning_agent_model=models["planning"], | |
).get_tools(), | |
*DocumentProcessingToolkit().get_tools(), | |
*VideoAnalysisToolkit(model=models["video"]).get_tools(), # This requires OpenAI Key | |
*AudioAnalysisToolkit().get_tools(), # This requires OpenAI Key | |
*CodeExecutionToolkit(sandbox="subprocess", verbose=True).get_tools(), | |
*ImageAnalysisToolkit(model=models["image"]).get_tools(), | |
*SearchToolkit(model=models["search"]).get_tools(), | |
*ExcelToolkit().get_tools(), | |
] | |
# Configure agent roles and parameters | |
user_agent_kwargs = {"model": models["user"]} | |
assistant_agent_kwargs = {"model": models["assistant"], "tools": tools} | |
# Initialize benchmark | |
benchmark = GAIABenchmark( | |
data_dir="data/gaia", | |
save_to=f"results/result.json" | |
) | |
# Print benchmark information | |
print(f"Number of validation examples: {len(benchmark.valid)}") | |
print(f"Number of test examples: {len(benchmark.test)}") | |
# Run benchmark | |
result = benchmark.run( | |
on="valid", | |
level=LEVEL, | |
idx=test_idx, | |
save_result=SAVE_RESULT, | |
user_role_name="user", | |
user_agent_kwargs=user_agent_kwargs, | |
assistant_role_name="assistant", | |
assistant_agent_kwargs=assistant_agent_kwargs, | |
) | |
# Output results | |
logger.success(f"Correct: {result['correct']}, Total: {result['total']}") | |
logger.success(f"Accuracy: {result['accuracy']}") | |
if __name__ == "__main__": | |
main() | |