Spaces:
Build error
Build error
Commit
·
0821095
0
Parent(s):
first commit
Browse files- .env.example +15 -0
- .gitignore +3 -0
- Dockerfile +83 -0
- README.md +90 -0
- data/.DS_Store +0 -0
- data/best.backup.json +185 -0
- data/best_model_for_category_list.json +190 -0
- data/best_model_for_results.json.lock +0 -0
- experiments/simple_smolagent.py +31 -0
- experiments/smolagent_parser.py +221 -0
- experiments/vision_web_browser.py +210 -0
- main.py +87 -0
- poetry.lock +0 -0
- pyproject.toml +27 -0
- scripts/test_agent.py +121 -0
- src/__pycache__/agent.cpython-310.pyc +0 -0
- src/__pycache__/browser.cpython-310.pyc +0 -0
- src/__pycache__/browser_utils.cpython-310.pyc +0 -0
- src/__pycache__/file_utils.cpython-310.pyc +0 -0
- src/__pycache__/hub_utils.cpython-310.pyc +0 -0
- src/__pycache__/leaderboard_processor.cpython-310.pyc +0 -0
- src/__pycache__/processor.cpython-310.pyc +0 -0
- src/__pycache__/scheduler.cpython-310.pyc +0 -0
- src/__pycache__/server.cpython-310.pyc +0 -0
- src/__pycache__/tools.cpython-310.pyc +0 -0
- src/agents/__pycache__/__init__.cpython-310.pyc +0 -0
- src/agents/__pycache__/agent.cpython-310.pyc +0 -0
- src/agents/__pycache__/agent_core.cpython-310.pyc +0 -0
- src/agents/__pycache__/agent_instructions.cpython-310.pyc +0 -0
- src/agents/__pycache__/agent_processor.cpython-310.pyc +0 -0
- src/agents/__pycache__/agent_tools.cpython-310.pyc +0 -0
- src/agents/__pycache__/browser.cpython-310.pyc +0 -0
- src/agents/__pycache__/prompts.cpython-310.pyc +0 -0
- src/agents/__pycache__/tools.cpython-310.pyc +0 -0
- src/agents/__pycache__/validators.cpython-310.pyc +0 -0
- src/agents/browser.py +148 -0
- src/agents/fact_checker/fact_checker_agent.py +3 -0
- src/agents/parser/__pycache__/agent.cpython-310.pyc +0 -0
- src/agents/parser/__pycache__/parser_agent.cpython-310.pyc +0 -0
- src/agents/parser/parser_agent.py +362 -0
- src/agents/tools.py +443 -0
- src/file_utils.py +316 -0
- src/hub_utils.py +175 -0
- src/leaderboard_processor.py +158 -0
- src/processor.py +414 -0
- src/scheduler.py +99 -0
- src/server.py +75 -0
.env.example
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Hugging Face Hub token (required)
|
2 |
+
# Create one at https://huggingface.co/settings/tokens
|
3 |
+
HUGGING_FACE_HUB_TOKEN=your_token_here
|
4 |
+
|
5 |
+
OPENAI_API_KEY=sk-proj-xxxx
|
6 |
+
|
7 |
+
# Repository ID for storing leaderboard data (required)
|
8 |
+
# Format: username/repo-name
|
9 |
+
HUGGING_FACE_STORAGE_REPO=username/leaderboard-data
|
10 |
+
|
11 |
+
# Intervalle de temps en heures avant de retraiter un leaderboard déjà analysé
|
12 |
+
LEADERBOARD_REPROCESS_INTERVAL_HOURS=24
|
13 |
+
|
14 |
+
# Nombre maximum de tentatives de traitement d'un leaderboard avant de considérer qu'il a échoué
|
15 |
+
LEADERBOARD_MAX_RETRIES=3
|
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
.env
|
2 |
+
data/best_model_for_results.json
|
3 |
+
data/final_leaderboards.json
|
Dockerfile
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10-slim
|
2 |
+
WORKDIR /app
|
3 |
+
|
4 |
+
# Create non-root user
|
5 |
+
RUN useradd -m -u 1000 user
|
6 |
+
|
7 |
+
# Install system dependencies
|
8 |
+
RUN apt-get update && apt-get install -y \
|
9 |
+
netcat-openbsd \
|
10 |
+
wget \
|
11 |
+
gnupg \
|
12 |
+
curl \
|
13 |
+
libnss3 \
|
14 |
+
libnspr4 \
|
15 |
+
libatk1.0-0 \
|
16 |
+
libatk-bridge2.0-0 \
|
17 |
+
libcups2 \
|
18 |
+
libdrm2 \
|
19 |
+
libdbus-1-3 \
|
20 |
+
libxkbcommon0 \
|
21 |
+
libx11-6 \
|
22 |
+
libxcomposite1 \
|
23 |
+
libxdamage1 \
|
24 |
+
libxext6 \
|
25 |
+
libxfixes3 \
|
26 |
+
libxrandr2 \
|
27 |
+
libgbm1 \
|
28 |
+
libpango-1.0-0 \
|
29 |
+
libcairo2 \
|
30 |
+
libasound2 \
|
31 |
+
libatspi2.0-0 \
|
32 |
+
unzip \
|
33 |
+
xvfb \
|
34 |
+
libglib2.0-0 \
|
35 |
+
&& pip install --upgrade pip \
|
36 |
+
&& pip install poetry
|
37 |
+
|
38 |
+
# Install Chrome - required for Helium
|
39 |
+
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
40 |
+
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list \
|
41 |
+
&& apt-get update \
|
42 |
+
&& apt-get install -y google-chrome-stable \
|
43 |
+
&& rm -rf /var/lib/apt/lists/*
|
44 |
+
|
45 |
+
# Configure environment variables for Chrome
|
46 |
+
ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver \
|
47 |
+
CHROME_PATH=/usr/bin/google-chrome-stable \
|
48 |
+
CHROME_BIN=/usr/bin/google-chrome-stable
|
49 |
+
|
50 |
+
# Copy application files
|
51 |
+
COPY . /app/
|
52 |
+
|
53 |
+
# Install Python dependencies
|
54 |
+
RUN poetry config virtualenvs.create false \
|
55 |
+
&& poetry install --no-interaction --no-ansi
|
56 |
+
|
57 |
+
# Environment variables
|
58 |
+
ENV API_HOST=0.0.0.0 \
|
59 |
+
API_PORT=7860 \
|
60 |
+
PYTHONPATH=/app \
|
61 |
+
DISPLAY=:99 \
|
62 |
+
PYTHONUNBUFFERED=1 \
|
63 |
+
SELENIUM_DRIVER_EXECUTABLE_PATH=/usr/bin/chromedriver \
|
64 |
+
LEADERBOARD_REPROCESS_INTERVAL_HOURS=24 \
|
65 |
+
HOME=/home/user
|
66 |
+
|
67 |
+
# Create cache directory and set permissions
|
68 |
+
RUN mkdir -p /app/cache /home/user/.cache && chown -R user:user /app/cache /app/ /home/user/.cache
|
69 |
+
|
70 |
+
# Install additional fonts
|
71 |
+
RUN apt-get update && apt-get install -y \
|
72 |
+
fonts-noto-color-emoji \
|
73 |
+
fonts-freefont-ttf \
|
74 |
+
libharfbuzz-icu0 \
|
75 |
+
&& rm -rf /var/lib/apt/lists/*
|
76 |
+
|
77 |
+
# Switch to non-root user
|
78 |
+
USER user
|
79 |
+
|
80 |
+
EXPOSE 7860
|
81 |
+
|
82 |
+
# Start the server in server mode
|
83 |
+
CMD ["python", "main.py", "--server", "--retry-rejected"]
|
README.md
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Leaderboard Parser
|
2 |
+
|
3 |
+
Un outil pour extraire automatiquement les données des leaderboards Hugging Face à l'aide d'agents IA.
|
4 |
+
|
5 |
+
## Structure du projet
|
6 |
+
|
7 |
+
```
|
8 |
+
leaderboard-parser/
|
9 |
+
├── main.py # Point d'entrée principal
|
10 |
+
├── data/ # Données d'entrée et de sortie
|
11 |
+
│ ├── leaderboards.json # Liste des URLs des leaderboards à traiter
|
12 |
+
│ └── leaderboard_results.json # Résultats de l'extraction
|
13 |
+
├── src/ # Code source principal
|
14 |
+
│ ├── agent.py # Gestion de l'agent IA
|
15 |
+
│ ├── browser.py # Gestion du navigateur
|
16 |
+
│ └── tools.py # Outils utilisés par l'agent
|
17 |
+
├── experiments/ # Scripts expérimentaux
|
18 |
+
├── pyproject.toml # Configuration Poetry
|
19 |
+
└── README.md # Documentation
|
20 |
+
```
|
21 |
+
|
22 |
+
## Description
|
23 |
+
|
24 |
+
Ce projet utilise Playwright et smolagents pour naviguer sur les leaderboards Hugging Face et extraire les informations du premier modèle de chaque leaderboard. Les informations extraites incluent :
|
25 |
+
|
26 |
+
- Nom du modèle
|
27 |
+
- Score
|
28 |
+
- Position/rang
|
29 |
+
- Créateur/auteur
|
30 |
+
|
31 |
+
## Prérequis
|
32 |
+
|
33 |
+
- Python 3.10 ou supérieur
|
34 |
+
- Poetry (gestionnaire de dépendances)
|
35 |
+
|
36 |
+
## Installation
|
37 |
+
|
38 |
+
1. Assurez-vous d'avoir Python 3.10+ installé
|
39 |
+
2. Installez Poetry si ce n'est pas déjà fait : `pip install poetry`
|
40 |
+
3. Installez les dépendances : `poetry install`
|
41 |
+
|
42 |
+
## Configuration
|
43 |
+
|
44 |
+
1. Copiez le fichier `.env.example` vers `.env`
|
45 |
+
2. Configurez vos clés API dans le fichier `.env`
|
46 |
+
3. Modifiez le fichier `data/leaderboards.json` pour ajouter ou supprimer des URLs de leaderboards
|
47 |
+
|
48 |
+
## Utilisation
|
49 |
+
|
50 |
+
Pour exécuter le parser sur tous les leaderboards définis dans `data/leaderboards.json` :
|
51 |
+
|
52 |
+
```bash
|
53 |
+
poetry run leaderboard-parser
|
54 |
+
```
|
55 |
+
|
56 |
+
Ou directement :
|
57 |
+
|
58 |
+
```bash
|
59 |
+
poetry run python main.py
|
60 |
+
```
|
61 |
+
|
62 |
+
Les résultats seront sauvegardés dans `data/leaderboard_results.json`.
|
63 |
+
|
64 |
+
## Fonctionnement
|
65 |
+
|
66 |
+
L'outil utilise un agent IA basé sur `smolagents` pour :
|
67 |
+
|
68 |
+
1. Naviguer vers chaque URL de leaderboard
|
69 |
+
2. Analyser la page pour trouver le tableau de classement
|
70 |
+
3. Extraire les trois premiers modèles avec leurs scores
|
71 |
+
4. Sauvegarder les résultats dans un fichier JSON
|
72 |
+
|
73 |
+
## Développement
|
74 |
+
|
75 |
+
Pour ajouter un nouvel outil à l'agent, créez une fonction dans `src/tools.py` et décorez-la avec `@tool`.
|
76 |
+
|
77 |
+
Pour modifier les instructions données à l'agent, modifiez la variable `leaderboard_instructions` dans `src/agent.py`.
|
78 |
+
|
79 |
+
## Expériences
|
80 |
+
|
81 |
+
Le dossier `experiments/` contient des scripts expérimentaux qui ont été utilisés pour développer et tester différentes approches.
|
82 |
+
|
83 |
+
## Comment ça fonctionne
|
84 |
+
|
85 |
+
1. Le script charge les URLs des leaderboards depuis `data/leaderboards.json`
|
86 |
+
2. Pour chaque URL, il lance un navigateur et utilise un agent IA pour :
|
87 |
+
- Naviguer vers l'URL du leaderboard
|
88 |
+
- Analyser la page pour trouver les trois premiers modèles
|
89 |
+
- Extraire les informations pertinentes
|
90 |
+
3. Les résultats sont sauvegardés dans `data/leaderboard_results.json`
|
data/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
data/best.backup.json
ADDED
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"category": "text generation",
|
4 |
+
"emoji": "📝",
|
5 |
+
"leaderboards": [
|
6 |
+
{
|
7 |
+
"uid": "6468923b99182de17844bf7b",
|
8 |
+
"additionnal_agent_rules": "",
|
9 |
+
"is_open_source": false
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"uid": "643d3016d2c1e08a5eca0c22",
|
13 |
+
"additionnal_agent_rules": "you have to check the 'only official providers' filter before trying to get the best models. It is mandatory to check this filter.",
|
14 |
+
"is_open_source": true
|
15 |
+
}
|
16 |
+
]
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"category": "uncensored text generation",
|
20 |
+
"emoji": "🔓",
|
21 |
+
"leaderboards": [
|
22 |
+
{
|
23 |
+
"uid": "65f0f612555caedb299e54d9",
|
24 |
+
"additionnal_agent_rules": "You have to remove models where are 'unavailable'",
|
25 |
+
"is_open_source": true
|
26 |
+
}
|
27 |
+
]
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"category": "image understanding",
|
31 |
+
"emoji": "📷",
|
32 |
+
"leaderboards": [
|
33 |
+
{
|
34 |
+
"uid": "6468923b99182de17844bf7b",
|
35 |
+
"additionnal_agent_rules": "you have to search for 'Arena (vision)' tab. You are searching for the best VLM models. It is mandatory to check this tab. If there is no informations about models in this tab, fail.",
|
36 |
+
"is_open_source": false
|
37 |
+
}
|
38 |
+
]
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"category": "agentic",
|
42 |
+
"emoji": "🤖",
|
43 |
+
"leaderboards": [
|
44 |
+
{
|
45 |
+
"uid": "67909d72a1832c8a7cdd4599",
|
46 |
+
"additionnal_agent_rules": "",
|
47 |
+
"is_open_source": false
|
48 |
+
}
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"category": "math",
|
53 |
+
"emoji": "🧮",
|
54 |
+
"leaderboards": [
|
55 |
+
{
|
56 |
+
"uid": "643d3016d2c1e08a5eca0c22",
|
57 |
+
"additionnal_agent_rules": "You have to click on MATH to sort the leaderboard by score. We are searching for the best math models.",
|
58 |
+
"is_open_source": true
|
59 |
+
}
|
60 |
+
]
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"category": "code",
|
64 |
+
"emoji": "💻",
|
65 |
+
"leaderboards": [
|
66 |
+
{
|
67 |
+
"uid": "6662b2c6cc6519da32cd6f4d",
|
68 |
+
"additionnal_agent_rules": "",
|
69 |
+
"is_open_source": false
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"uid": "657b23848e7790a347c7e4ea",
|
73 |
+
"additionnal_agent_rules": "",
|
74 |
+
"is_open_source": false
|
75 |
+
}
|
76 |
+
]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"category": "embedding",
|
80 |
+
"emoji": "📦",
|
81 |
+
"leaderboards": [
|
82 |
+
{
|
83 |
+
"uid": "633581939ac57cf2967be686",
|
84 |
+
"additionnal_agent_rules": "",
|
85 |
+
"is_open_source": false
|
86 |
+
}
|
87 |
+
]
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"category": "text to image",
|
91 |
+
"emoji": "🎨",
|
92 |
+
"leaderboards": [
|
93 |
+
{
|
94 |
+
"uid": "665e7241f8cb81b0a476eccb",
|
95 |
+
"additionnal_agent_rules": "",
|
96 |
+
"is_open_source": false
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"uid": "6670f4cffc615a6257ab35dd",
|
100 |
+
"additionnal_agent_rules": "You have to search for the 'image generation' leaderboard before trying to get the best models. If you don't find it. Fail."
|
101 |
+
}
|
102 |
+
]
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"category": "text to video",
|
106 |
+
"emoji": "🎬",
|
107 |
+
"leaderboards": [
|
108 |
+
{
|
109 |
+
"uid": "6719d6a46937670ca681151e",
|
110 |
+
"additionnal_agent_rules": "",
|
111 |
+
"is_open_source": false
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"uid": "6670f4cffc615a6257ab35dd",
|
115 |
+
"additionnal_agent_rules": "You have to search for the 'video generation' tab to access the leaderboard before trying to get the best models. If you don't find it. Fail."
|
116 |
+
}
|
117 |
+
]
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"category": "text to 3d",
|
121 |
+
"emoji": "🧊",
|
122 |
+
"leaderboards": [
|
123 |
+
{
|
124 |
+
"uid": "651f831f128d26b399db9ea5",
|
125 |
+
"additionnal_agent_rules": "",
|
126 |
+
"is_open_source": false
|
127 |
+
}
|
128 |
+
]
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"category": "text to speech",
|
132 |
+
"emoji": "🔊",
|
133 |
+
"leaderboards": [
|
134 |
+
{
|
135 |
+
"uid": "65a5a7c26145ebc6e7e39243",
|
136 |
+
"additionnal_agent_rules": "",
|
137 |
+
"is_open_source": false
|
138 |
+
}
|
139 |
+
]
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"category": "speech to text",
|
143 |
+
"emoji": "🎤",
|
144 |
+
"leaderboards": [
|
145 |
+
{
|
146 |
+
"uid": "64f9e6dd59eae6df399ba1e9",
|
147 |
+
"additionnal_agent_rules": "",
|
148 |
+
"is_open_source": true
|
149 |
+
}
|
150 |
+
]
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"category": "image to text",
|
154 |
+
"emoji": "📝",
|
155 |
+
"leaderboards": [
|
156 |
+
{
|
157 |
+
"uid": "65b0a64db233ea8ce65f0bc5",
|
158 |
+
"additionnal_agent_rules": "",
|
159 |
+
"is_open_source": false
|
160 |
+
}
|
161 |
+
]
|
162 |
+
},
|
163 |
+
{
|
164 |
+
"category": "image background removal",
|
165 |
+
"emoji": "🖼️",
|
166 |
+
"leaderboards": [
|
167 |
+
{
|
168 |
+
"uid": "674eea98c6a6ef2849b4a0ac",
|
169 |
+
"additionnal_agent_rules": "",
|
170 |
+
"is_open_source": false
|
171 |
+
}
|
172 |
+
]
|
173 |
+
},
|
174 |
+
{
|
175 |
+
"category": "medical QA tasks",
|
176 |
+
"emoji": "🩺",
|
177 |
+
"leaderboards": [
|
178 |
+
{
|
179 |
+
"uid": "65d70863ef58a69470ead2fc",
|
180 |
+
"additionnal_agent_rules": "",
|
181 |
+
"is_open_source": true
|
182 |
+
}
|
183 |
+
]
|
184 |
+
}
|
185 |
+
]
|
data/best_model_for_category_list.json
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"category": "text generation",
|
4 |
+
"emoji": "📝",
|
5 |
+
"leaderboards": [
|
6 |
+
{
|
7 |
+
"uid": "6468923b99182de17844bf7b",
|
8 |
+
"additionnal_agent_rules": "",
|
9 |
+
"is_open_source": false
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"uid": "643d3016d2c1e08a5eca0c22",
|
13 |
+
"additionnal_agent_rules": "you have to check the 'only official providers' filter before trying to get the best models. It is mandatory to check this filter.",
|
14 |
+
"is_open_source": true
|
15 |
+
}
|
16 |
+
]
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"category": "uncensored text generation",
|
20 |
+
"emoji": "🔓",
|
21 |
+
"leaderboards": [
|
22 |
+
{
|
23 |
+
"uid": "65f0f612555caedb299e54d9",
|
24 |
+
"additionnal_agent_rules": "You have to remove models where are '(no longer available)'. Keep all the other models.",
|
25 |
+
"is_open_source": true
|
26 |
+
}
|
27 |
+
]
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"category": "image understanding",
|
31 |
+
"emoji": "📷",
|
32 |
+
"leaderboards": [
|
33 |
+
{
|
34 |
+
"uid": "6468923b99182de17844bf7b",
|
35 |
+
"additionnal_agent_rules": "you have to search for 'Arena (vision)' tab. You are searching for the best VLM models. It is mandatory to check this tab. If there is no informations about models in this tab, fail.",
|
36 |
+
"is_open_source": false
|
37 |
+
}
|
38 |
+
]
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"category": "agentic",
|
42 |
+
"emoji": "🤖",
|
43 |
+
"leaderboards": [
|
44 |
+
{
|
45 |
+
"uid": "67909d72a1832c8a7cdd4599",
|
46 |
+
"additionnal_agent_rules": "",
|
47 |
+
"is_open_source": false
|
48 |
+
}
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"category": "math",
|
53 |
+
"emoji": "🧮",
|
54 |
+
"leaderboards": [
|
55 |
+
{
|
56 |
+
"uid": "643d3016d2c1e08a5eca0c22",
|
57 |
+
"additionnal_agent_rules": "You have to click on MATH to sort the leaderboard by score. We are searching for the best math models.",
|
58 |
+
"is_open_source": true
|
59 |
+
}
|
60 |
+
]
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"category": "code",
|
64 |
+
"emoji": "💻",
|
65 |
+
"leaderboards": [
|
66 |
+
{
|
67 |
+
"uid": "6662b2c6cc6519da32cd6f4d",
|
68 |
+
"additionnal_agent_rules": "",
|
69 |
+
"is_open_source": false
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"uid": "657b23848e7790a347c7e4ea",
|
73 |
+
"additionnal_agent_rules": "",
|
74 |
+
"is_open_source": false
|
75 |
+
}
|
76 |
+
]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"category": "embedding",
|
80 |
+
"emoji": "📦",
|
81 |
+
"leaderboards": [
|
82 |
+
{
|
83 |
+
"uid": "633581939ac57cf2967be686",
|
84 |
+
"additionnal_agent_rules": "",
|
85 |
+
"is_open_source": false
|
86 |
+
}
|
87 |
+
]
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"category": "text to image",
|
91 |
+
"emoji": "🎨",
|
92 |
+
"leaderboards": [
|
93 |
+
{
|
94 |
+
"uid": "665e7241f8cb81b0a476eccb",
|
95 |
+
"additionnal_agent_rules": "",
|
96 |
+
"is_open_source": false
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"uid": "6670f4cffc615a6257ab35dd",
|
100 |
+
"additionnal_agent_rules": "You have to search for the 'image generation' leaderboard before trying to get the best models. If you don't find it. Fail."
|
101 |
+
}
|
102 |
+
]
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"category": "text to video",
|
106 |
+
"emoji": "🎬",
|
107 |
+
"leaderboards": [
|
108 |
+
{
|
109 |
+
"uid": "65adcd10d6b10af9119fc960",
|
110 |
+
"additionnal_agent_rules": "",
|
111 |
+
"is_open_source": false
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"uid": "6719d6a46937670ca681151e",
|
115 |
+
"additionnal_agent_rules": "",
|
116 |
+
"is_open_source": false
|
117 |
+
},
|
118 |
+
{
|
119 |
+
"uid": "6670f4cffc615a6257ab35dd",
|
120 |
+
"additionnal_agent_rules": "You have to search for the 'video generation' tab to access the leaderboard before trying to get the best models. If you don't find it. Fail."
|
121 |
+
}
|
122 |
+
]
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"category": "text to 3d",
|
126 |
+
"emoji": "🧊",
|
127 |
+
"leaderboards": [
|
128 |
+
{
|
129 |
+
"uid": "651f831f128d26b399db9ea5",
|
130 |
+
"additionnal_agent_rules": "",
|
131 |
+
"is_open_source": false
|
132 |
+
}
|
133 |
+
]
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"category": "text to speech",
|
137 |
+
"emoji": "🔊",
|
138 |
+
"leaderboards": [
|
139 |
+
{
|
140 |
+
"uid": "65a5a7c26145ebc6e7e39243",
|
141 |
+
"additionnal_agent_rules": "",
|
142 |
+
"is_open_source": false
|
143 |
+
}
|
144 |
+
]
|
145 |
+
},
|
146 |
+
{
|
147 |
+
"category": "speech to text",
|
148 |
+
"emoji": "🎤",
|
149 |
+
"leaderboards": [
|
150 |
+
{
|
151 |
+
"uid": "64f9e6dd59eae6df399ba1e9",
|
152 |
+
"additionnal_agent_rules": "",
|
153 |
+
"is_open_source": true
|
154 |
+
}
|
155 |
+
]
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"category": "image to text",
|
159 |
+
"emoji": "📝",
|
160 |
+
"leaderboards": [
|
161 |
+
{
|
162 |
+
"uid": "65b0a64db233ea8ce65f0bc5",
|
163 |
+
"additionnal_agent_rules": "",
|
164 |
+
"is_open_source": false
|
165 |
+
}
|
166 |
+
]
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"category": "image background removal",
|
170 |
+
"emoji": "🖼️",
|
171 |
+
"leaderboards": [
|
172 |
+
{
|
173 |
+
"uid": "674eea98c6a6ef2849b4a0ac",
|
174 |
+
"additionnal_agent_rules": "",
|
175 |
+
"is_open_source": false
|
176 |
+
}
|
177 |
+
]
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"category": "medical QA tasks",
|
181 |
+
"emoji": "🩺",
|
182 |
+
"leaderboards": [
|
183 |
+
{
|
184 |
+
"uid": "65d70863ef58a69470ead2fc",
|
185 |
+
"additionnal_agent_rules": "",
|
186 |
+
"is_open_source": true
|
187 |
+
}
|
188 |
+
]
|
189 |
+
}
|
190 |
+
]
|
data/best_model_for_results.json.lock
ADDED
File without changes
|
experiments/simple_smolagent.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
"""
|
5 |
+
Simple SmoLAgent - Un script simple pour tester l'authentification avec Hugging Face
|
6 |
+
"""
|
7 |
+
|
8 |
+
import os
|
9 |
+
import asyncio
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
from huggingface_hub import login
|
12 |
+
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel
|
13 |
+
|
14 |
+
# Charger les variables d'environnement depuis le fichier .env
|
15 |
+
load_dotenv()
|
16 |
+
|
17 |
+
# Récupérer le token Hugging Face
|
18 |
+
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
|
19 |
+
|
20 |
+
def main():
|
21 |
+
"""Fonction principale pour tester l'authentification avec Hugging Face."""
|
22 |
+
|
23 |
+
agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=HfApiModel())
|
24 |
+
|
25 |
+
response = agent.run("Search for the best music recommendations for a party at the Wayne's mansion.")
|
26 |
+
# Tester l'agent avec une requête simple
|
27 |
+
print(f"Réponse de l'agent: {response}")
|
28 |
+
|
29 |
+
|
30 |
+
if __name__ == "__main__":
|
31 |
+
main()
|
experiments/smolagent_parser.py
ADDED
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
"""
|
5 |
+
SmoLAgent Parser - Extrait le premier modèle de chaque leaderboard Hugging Face
|
6 |
+
en utilisant Playwright et smolagents.
|
7 |
+
"""
|
8 |
+
|
9 |
+
import json
|
10 |
+
import os
|
11 |
+
import asyncio
|
12 |
+
from pathlib import Path
|
13 |
+
from typing import List, Dict, Any, Optional
|
14 |
+
from dotenv import load_dotenv
|
15 |
+
from huggingface_hub import login
|
16 |
+
|
17 |
+
from playwright.async_api import async_playwright
|
18 |
+
from smolagents import CodeAgent
|
19 |
+
from smolagents.models import HfApiModel
|
20 |
+
from smolagents.tools import Tool
|
21 |
+
|
22 |
+
# Charger les variables d'environnement depuis le fichier .env
|
23 |
+
load_dotenv()
|
24 |
+
|
25 |
+
# Récupérer le token Hugging Face
|
26 |
+
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
|
27 |
+
|
28 |
+
# Charger les leaderboards depuis le fichier JSON
|
29 |
+
def load_leaderboards() -> List[str]:
|
30 |
+
"""Charger les URLs des leaderboards depuis le fichier JSON."""
|
31 |
+
with open("leaderboards.json", "r") as f:
|
32 |
+
return json.load(f)
|
33 |
+
|
34 |
+
# Définir un outil pour utiliser Playwright
|
35 |
+
class PlaywrightBrowserTool(Tool):
|
36 |
+
"""Outil pour interagir avec un navigateur web via Playwright."""
|
37 |
+
|
38 |
+
name = "browser"
|
39 |
+
description = "Outil pour interagir avec un navigateur web via Playwright."
|
40 |
+
inputs = {
|
41 |
+
"goto": {
|
42 |
+
"url": {
|
43 |
+
"type": "string",
|
44 |
+
"description": "L'URL vers laquelle naviguer"
|
45 |
+
}
|
46 |
+
},
|
47 |
+
"get_content": {},
|
48 |
+
"get_title": {},
|
49 |
+
"take_screenshot": {
|
50 |
+
"path": {
|
51 |
+
"type": "string",
|
52 |
+
"description": "Le chemin où enregistrer la capture d'écran"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"run_js": {
|
56 |
+
"script": {
|
57 |
+
"type": "string",
|
58 |
+
"description": "Le code JavaScript à exécuter dans le contexte de la page"
|
59 |
+
}
|
60 |
+
},
|
61 |
+
"wait_for": {
|
62 |
+
"selector": {
|
63 |
+
"type": "string",
|
64 |
+
"description": "Le sélecteur CSS à attendre"
|
65 |
+
},
|
66 |
+
"timeout": {
|
67 |
+
"type": "integer",
|
68 |
+
"description": "Le temps maximum d'attente en millisecondes"
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"click": {
|
72 |
+
"selector": {
|
73 |
+
"type": "string",
|
74 |
+
"description": "Le sélecteur CSS de l'élément à cliquer"
|
75 |
+
}
|
76 |
+
},
|
77 |
+
"fill": {
|
78 |
+
"selector": {
|
79 |
+
"type": "string",
|
80 |
+
"description": "Le sélecteur CSS du champ de formulaire"
|
81 |
+
},
|
82 |
+
"value": {
|
83 |
+
"type": "string",
|
84 |
+
"description": "La valeur à remplir dans le champ de formulaire"
|
85 |
+
}
|
86 |
+
}
|
87 |
+
}
|
88 |
+
output_type = "any"
|
89 |
+
|
90 |
+
def __init__(self, page):
|
91 |
+
self.page = page
|
92 |
+
|
93 |
+
async def goto(self, url: str) -> str:
|
94 |
+
"""Naviguer vers une URL."""
|
95 |
+
await self.page.goto(url, wait_until="networkidle", timeout=60000)
|
96 |
+
return f"Navigué vers {url}"
|
97 |
+
|
98 |
+
async def get_content(self) -> str:
|
99 |
+
"""Obtenir le contenu HTML de la page."""
|
100 |
+
return await self.page.content()
|
101 |
+
|
102 |
+
async def get_title(self) -> str:
|
103 |
+
"""Obtenir le titre de la page."""
|
104 |
+
return await self.page.title()
|
105 |
+
|
106 |
+
async def take_screenshot(self, path: str = "screenshot.png") -> str:
|
107 |
+
"""Prendre une capture d'écran de la page."""
|
108 |
+
await self.page.screenshot(path=path)
|
109 |
+
return f"Capture d'écran enregistrée dans {path}"
|
110 |
+
|
111 |
+
async def run_js(self, script: str) -> Any:
|
112 |
+
"""Exécuter du JavaScript dans le contexte de la page."""
|
113 |
+
return await self.page.evaluate(script)
|
114 |
+
|
115 |
+
async def wait_for(self, selector: str, timeout: int = 30000) -> str:
|
116 |
+
"""Attendre qu'un élément correspondant au sélecteur apparaisse."""
|
117 |
+
await self.page.wait_for_selector(selector, timeout=timeout)
|
118 |
+
return f"Élément avec le sélecteur '{selector}' trouvé"
|
119 |
+
|
120 |
+
async def click(self, selector: str) -> str:
|
121 |
+
"""Cliquer sur un élément correspondant au sélecteur."""
|
122 |
+
await self.page.click(selector)
|
123 |
+
return f"Cliqué sur l'élément avec le sélecteur '{selector}'"
|
124 |
+
|
125 |
+
async def fill(self, selector: str, value: str) -> str:
|
126 |
+
"""Remplir un champ de formulaire."""
|
127 |
+
await self.page.fill(selector, value)
|
128 |
+
return f"Rempli '{value}' dans l'élément avec le sélecteur '{selector}'"
|
129 |
+
|
130 |
+
async def extract_first_model(url: str) -> Optional[Dict[str, Any]]:
|
131 |
+
"""
|
132 |
+
Extraire le premier modèle d'un leaderboard en utilisant un agent.
|
133 |
+
|
134 |
+
Args:
|
135 |
+
url: L'URL du leaderboard
|
136 |
+
|
137 |
+
Returns:
|
138 |
+
Un dictionnaire contenant les informations sur le premier modèle, ou None si l'extraction a échoué
|
139 |
+
"""
|
140 |
+
async with async_playwright() as p:
|
141 |
+
browser = await p.chromium.launch(headless=False) # Mettre à True pour la production
|
142 |
+
page = await browser.new_page()
|
143 |
+
|
144 |
+
try:
|
145 |
+
# Créer l'outil Playwright
|
146 |
+
browser_tool = PlaywrightBrowserTool(page)
|
147 |
+
|
148 |
+
# Créer l'agent
|
149 |
+
agent = CodeAgent(
|
150 |
+
tools=[browser_tool],
|
151 |
+
model=HfApiModel()
|
152 |
+
)
|
153 |
+
|
154 |
+
# Exécuter l'agent
|
155 |
+
prompt = f"""
|
156 |
+
Extrais les informations sur le premier modèle du leaderboard à l'URL suivante: {url}
|
157 |
+
|
158 |
+
Utilise l'outil browser pour naviguer sur la page et extraire les informations suivantes:
|
159 |
+
- Nom du modèle
|
160 |
+
- Score
|
161 |
+
- Position/rang
|
162 |
+
- Créateur/auteur
|
163 |
+
|
164 |
+
Retourne les informations sous forme de dictionnaire Python.
|
165 |
+
"""
|
166 |
+
|
167 |
+
result = await agent.run(prompt)
|
168 |
+
print(f"Résultat brut de l'agent: {result}")
|
169 |
+
|
170 |
+
# Essayer de parser le résultat comme un dictionnaire
|
171 |
+
try:
|
172 |
+
# L'agent peut retourner une représentation textuelle d'un dictionnaire
|
173 |
+
if isinstance(result, str):
|
174 |
+
# Essayer de trouver une structure de dictionnaire dans la chaîne
|
175 |
+
import re
|
176 |
+
dict_match = re.search(r'\{.*\}', result, re.DOTALL)
|
177 |
+
if dict_match:
|
178 |
+
dict_str = dict_match.group(0)
|
179 |
+
# Remplacer les guillemets simples par des guillemets doubles pour un JSON valide
|
180 |
+
dict_str = dict_str.replace("'", '"')
|
181 |
+
return json.loads(dict_str)
|
182 |
+
return {"raw_result": result}
|
183 |
+
return result
|
184 |
+
except Exception as e:
|
185 |
+
print(f"Erreur lors du parsing du résultat: {e}")
|
186 |
+
return {"raw_result": str(result)}
|
187 |
+
except Exception as e:
|
188 |
+
print(f"Erreur lors de l'extraction des données de {url}: {e}")
|
189 |
+
await page.screenshot(path=f"error_{url.replace('://', '_').replace('/', '_')}.png")
|
190 |
+
return {"error": str(e)}
|
191 |
+
finally:
|
192 |
+
await browser.close()
|
193 |
+
|
194 |
+
async def main():
    """Entry point: authenticate with Hugging Face, scrape every leaderboard, and dump results to JSON."""
    # A Hugging Face token is mandatory; abort early when it is missing.
    if not hf_token:
        print("Erreur: Token Hugging Face non trouvé dans le fichier .env")
        return
    print("Token Hugging Face trouvé dans le fichier .env")
    login(token=hf_token)
    print("Connexion à Hugging Face réussie!")

    results = {}
    # Leaderboards are processed sequentially; each extraction drives its own browser.
    for url in load_leaderboards():
        print(f"Traitement du leaderboard: {url}")
        extracted = await extract_first_model(url)
        results[url] = extracted
        print(f"Résultat: {extracted}")

    # Persist everything in a single JSON document.
    with open("results_smolagent.json", "w") as output_file:
        json.dump(results, output_file, indent=2)

    print(f"Résultats sauvegardés dans results_smolagent.json")


if __name__ == "__main__":
    asyncio.run(main())
|
experiments/vision_web_browser.py
ADDED
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
from io import BytesIO
|
3 |
+
from time import sleep
|
4 |
+
|
5 |
+
import helium
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
from PIL import Image
|
8 |
+
from selenium import webdriver
|
9 |
+
from selenium.webdriver.common.by import By
|
10 |
+
from selenium.webdriver.common.keys import Keys
|
11 |
+
|
12 |
+
from smolagents import CodeAgent, DuckDuckGoSearchTool, tool
|
13 |
+
from smolagents.agents import ActionStep
|
14 |
+
from smolagents.cli import load_model
|
15 |
+
|
16 |
+
|
17 |
+
github_request = """
|
18 |
+
I'm trying to find how hard I have to work to get a repo in github.com/trending.
|
19 |
+
Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
|
20 |
+
""" # The agent is able to achieve this request only when powered by GPT-4o or Claude-3.5-sonnet.
|
21 |
+
|
22 |
+
search_request = """
|
23 |
+
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
|
24 |
+
"""
|
25 |
+
|
26 |
+
|
27 |
+
def parse_arguments():
    """Build and parse the command-line interface for the browser-automation demo."""
    parser = argparse.ArgumentParser(description="Run a web browser automation script with a specified model.")
    # The prompt is a positional, optional argument; it falls back to the GitHub trending demo request.
    parser.add_argument(
        "prompt",
        type=str,
        nargs="?",
        default=github_request,
        help="The prompt to run with the agent",
    )
    parser.add_argument(
        "--model-type",
        type=str,
        default="LiteLLMModel",
        help="The model type to use (e.g., OpenAIServerModel, LiteLLMModel, TransformersModel, HfApiModel)",
    )
    parser.add_argument(
        "--model-id",
        type=str,
        default="gpt-4o",
        help="The model ID to use for the specified model type",
    )
    return parser.parse_args()
|
49 |
+
|
50 |
+
|
51 |
+
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    """Step callback: attach a fresh browser screenshot and the current URL to this step."""
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    step_number = memory_step.step_number
    if driver is None:
        return
    # Prune screenshots attached more than one step ago to keep agent memory lean.
    for earlier_step in agent.memory.steps:
        if isinstance(earlier_step, ActionStep) and earlier_step.step_number <= step_number - 2:
            earlier_step.observations_images = None
    screenshot = Image.open(BytesIO(driver.get_screenshot_as_png()))
    print(f"Captured a browser screenshot: {screenshot.size} pixels")
    # Copy the image so it survives independently of the driver's buffer.
    memory_step.observations_images = [screenshot.copy()]

    # Append the current URL to whatever observations the step already has.
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )
    return
|
70 |
+
|
71 |
+
|
72 |
+
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    # XPath text-contains search over every element on the page.
    matches = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if nth_result > len(matches):
        raise Exception(f"Match n°{nth_result} not found (only {len(matches)} matches found)")
    message = f"Found {len(matches)} matches for '{text}'."
    target = matches[nth_result - 1]
    # Scroll the requested occurrence into the viewport.
    driver.execute_script("arguments[0].scrollIntoView(true);", target)
    message += f"Focused on element {nth_result} of {len(matches)}"
    return message
|
88 |
+
|
89 |
+
|
90 |
+
@tool
def go_back() -> None:
    """Goes back to previous page."""
    # Delegates to Selenium's browser-history API on the module-level driver.
    driver.back()
|
94 |
+
|
95 |
+
|
96 |
+
@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
    """
    # Most modal frameworks close when ESC is pressed on the page body.
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    # Return a status message so the declared `-> str` contract is honored
    # (the original implicitly returned None despite the annotation, so the
    # agent saw "None" as the tool output).
    return "Sent ESCAPE to dismiss any open pop-up."
|
102 |
+
|
103 |
+
|
104 |
+
def initialize_driver():
    """Initialize the Selenium WebDriver."""
    options = webdriver.ChromeOptions()
    # Fixed scale, window size and position give reproducible screenshots.
    for flag in (
        "--force-device-scale-factor=1",
        "--window-size=1000,1350",
        "--disable-pdf-viewer",
        "--window-position=0,0",
    ):
        options.add_argument(flag)
    return helium.start_chrome(headless=False, options=options)
|
112 |
+
|
113 |
+
|
114 |
+
def initialize_agent(model):
    """Initialize the CodeAgent with the specified model."""
    # Web search plus the browser helper tools defined in this module.
    browsing_tools = [DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f]
    return CodeAgent(
        tools=browsing_tools,
        model=model,
        additional_authorized_imports=["helium"],
        step_callbacks=[save_screenshot],  # Screenshot after every action step
        max_steps=20,
        verbosity_level=2,
    )
|
124 |
+
|
125 |
+
|
126 |
+
helium_instructions = """
|
127 |
+
Use your web_search tool when you want to get Google search results.
|
128 |
+
Then you can use helium to access websites. Don't use helium for Google search, only for navigating websites!
|
129 |
+
Don't bother about the helium driver, it's already managed.
|
130 |
+
We've already ran "from helium import *"
|
131 |
+
Then you can go to pages!
|
132 |
+
Code:
|
133 |
+
```py
|
134 |
+
go_to('github.com/trending')
|
135 |
+
```<end_code>
|
136 |
+
|
137 |
+
You can directly click clickable elements by inputting the text that appears on them.
|
138 |
+
Code:
|
139 |
+
```py
|
140 |
+
click("Top products")
|
141 |
+
```<end_code>
|
142 |
+
|
143 |
+
If it's a link:
|
144 |
+
Code:
|
145 |
+
```py
|
146 |
+
click(Link("Top products"))
|
147 |
+
```<end_code>
|
148 |
+
|
149 |
+
If you try to interact with an element and it's not found, you'll get a LookupError.
|
150 |
+
In general stop your action after each button click to see what happens on your screenshot.
|
151 |
+
Never try to login in a page.
|
152 |
+
|
153 |
+
To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
|
154 |
+
Code:
|
155 |
+
```py
|
156 |
+
scroll_down(num_pixels=1200) # This will scroll one viewport down
|
157 |
+
```<end_code>
|
158 |
+
|
159 |
+
When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
|
160 |
+
Just use your built-in tool `close_popups` to close them:
|
161 |
+
Code:
|
162 |
+
```py
|
163 |
+
close_popups()
|
164 |
+
```<end_code>
|
165 |
+
|
166 |
+
You can use .exists() to check for the existence of an element. For example:
|
167 |
+
Code:
|
168 |
+
```py
|
169 |
+
if Text('Accept cookies?').exists():
|
170 |
+
click('I accept')
|
171 |
+
```<end_code>
|
172 |
+
|
173 |
+
Proceed in several steps rather than trying to solve the task in one shot.
|
174 |
+
And at the end, only when you have your answer, return your final answer.
|
175 |
+
Code:
|
176 |
+
```py
|
177 |
+
final_answer("YOUR_ANSWER_HERE")
|
178 |
+
```<end_code>
|
179 |
+
|
180 |
+
If pages seem stuck on loading, you might have to wait, for instance `import time` and run `time.sleep(5.0)`. But don't overuse this!
|
181 |
+
To list elements on page, DO NOT try code-based element searches like 'contributors = find_all(S("ol > li"))': just look at the latest screenshot you have and read it visually, or use your tool search_item_ctrl_f.
|
182 |
+
Of course, you can act on buttons like a user would do when navigating.
|
183 |
+
After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url.
|
184 |
+
But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states.
|
185 |
+
Don't kill the browser.
|
186 |
+
When you have modals or cookie banners on screen, you should get rid of them before you can click anything else.
|
187 |
+
"""
|
188 |
+
|
189 |
+
|
190 |
+
def main():
    """Wire together environment, model, browser and agent, then run the prompt."""
    load_dotenv()
    args = parse_arguments()
    model = load_model(args.model_type, args.model_id)

    # The helper tools in this module resolve `driver` at call time,
    # so it must live at module scope.
    global driver
    driver = initialize_driver()
    agent = initialize_agent(model)

    # Pre-import helium into the agent's Python sandbox, then run the task.
    agent.python_executor("from helium import *", agent.state)
    agent.run(args.prompt + helium_instructions)


if __name__ == "__main__":
    main()
|
main.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
"""
|
3 |
+
Main script for the leaderboard parser.
|
4 |
+
This script processes leaderboards specified in data/best_model_for_category_list.json file
|
5 |
+
by matching their UIDs with hosts in data/final_leaderboards.json.
|
6 |
+
|
7 |
+
Environment variables:
|
8 |
+
HUGGING_FACE_HUB_TOKEN: Authentication token for Hugging Face Hub (required)
|
9 |
+
HUGGING_FACE_STORAGE_REPO: Target dataset name on the Hub (optional, default: leaderboard-explorer/leaderboard_explorer)
|
10 |
+
LEADERBOARD_REPROCESS_INTERVAL_HOURS: Interval in hours between leaderboard processing runs (default: 24)
|
11 |
+
"""
|
12 |
+
import argparse
|
13 |
+
import logging
|
14 |
+
from dotenv import load_dotenv
|
15 |
+
import uvicorn
|
16 |
+
import sys
|
17 |
+
|
18 |
+
# Import from src modules
|
19 |
+
from src.processor import process_leaderboards
|
20 |
+
from src.server import app, initialize_server
|
21 |
+
from src.scheduler import initialize_scheduler, start_scheduler
|
22 |
+
|
23 |
+
# Configure logging
|
24 |
+
logging.basicConfig(
|
25 |
+
level=logging.INFO,
|
26 |
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
27 |
+
)
|
28 |
+
logger = logging.getLogger("leaderboard-parser")
|
29 |
+
|
30 |
+
def main():
    """
    Main function to process leaderboards specified in best_model_for_category_list.json.
    """
    # Command-line flags controlling what gets (re)processed and where data flows.
    parser = argparse.ArgumentParser(description="Leaderboard Parser")
    parser.add_argument("--clean", action="store_true", help="Clean the results file before starting")
    parser.add_argument("--force-retry-uid", help="Force retry for a specific leaderboard UID")
    parser.add_argument("--force-retry-category", help="Force retry for all leaderboards of a specific category")
    parser.add_argument("--upload-only", action="store_true", help="Only upload local files to the Hub without processing leaderboards")
    parser.add_argument("--local-only", action="store_true", help="Local mode only: do not download from the Hub and do not upload to the Hub")
    parser.add_argument("--retry-rejected", action="store_true", help="Force reprocessing of rejected leaderboards even if it's been less than 24h")
    parser.add_argument("--server", action="store_true", help="Run as a web server with scheduled processing")
    args = parser.parse_args()

    load_dotenv()

    # Server mode takes over entirely (scheduler + FastAPI).
    if args.server:
        run_server_mode()
        return

    # One-shot processing run; exit code reflects success.
    success, message = process_leaderboards(vars(args))
    if success:
        logger.info(message)
        return 0
    logger.error(message)
    return 1
|
65 |
+
|
66 |
+
def run_server_mode():
    """Run the application in server mode with periodic processing"""
    # Both the HTTP server and the scheduler drive the same processing function.
    initialize_server(process_leaderboards)
    initialize_scheduler(process_leaderboards)

    # Background thread that triggers periodic reprocessing.
    scheduler = start_scheduler()

    try:
        logger.info("Running in server mode with periodic processing")
        # Blocking call: serves the FastAPI app until interrupted.
        uvicorn.run(app, host="0.0.0.0", port=8000)
    except KeyboardInterrupt:
        logger.info("Server stopped by user")
    except Exception as e:
        logger.error(f"Error running server: {e}")


if __name__ == "__main__":
    sys.exit(main())
|
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.poetry]
|
2 |
+
name = "leaderboard-parser"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = "Parser for Hugging Face leaderboards"
|
5 |
+
authors = ["Thibaud Frere"]
|
6 |
+
readme = "README.md"
|
7 |
+
packages = [{include = "src"}]
|
8 |
+
package-mode = false
|
9 |
+
|
10 |
+
[tool.poetry.dependencies]
|
11 |
+
python = ">=3.10,<3.14"
|
12 |
+
python-dotenv = "^1.0.1"
|
13 |
+
opentelemetry-sdk = "^1.30.0"
|
14 |
+
opentelemetry-exporter-otlp = "^1.30.0"
|
15 |
+
openinference-instrumentation-smolagents = "^0.1.6"
|
16 |
+
helium = "^5.1.1"
|
17 |
+
huggingface-hub = "^0.29.1"
|
18 |
+
fastapi = "^0.115.11"
|
19 |
+
uvicorn = "^0.34.0"
|
20 |
+
smolagents = {version = "^1.9.2", extras = ["litellm"]}
|
21 |
+
|
22 |
+
[tool.poetry.scripts]
|
23 |
+
leaderboard-parser = "main:main"
|
24 |
+
|
25 |
+
[build-system]
|
26 |
+
requires = ["poetry-core"]
|
27 |
+
build-backend = "poetry.core.masonry.api"
|
scripts/test_agent.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
"""
|
3 |
+
Script de test pour l'agent de parsing de leaderboard.
|
4 |
+
Ce script permet de tester l'agent en standalone en lui donnant une URL.
|
5 |
+
|
6 |
+
Usage:
|
7 |
+
python test_agent.py <url>
|
8 |
+
|
9 |
+
Exemple:
|
10 |
+
python test_agent.py https://lmarena-ai-chatbot-arena-leaderboard.hf.space
|
11 |
+
"""
|
12 |
+
import json
|
13 |
+
import os
|
14 |
+
import sys
|
15 |
+
import argparse
|
16 |
+
import datetime
|
17 |
+
from dotenv import load_dotenv
|
18 |
+
|
19 |
+
# Assurez-vous que le répertoire parent est dans le chemin d'importation
|
20 |
+
script_dir = os.path.dirname(os.path.abspath(__file__))
|
21 |
+
root_dir = os.path.dirname(script_dir)
|
22 |
+
sys.path.insert(0, root_dir)
|
23 |
+
|
24 |
+
from src.agent import get_default_model, process_leaderboard
|
25 |
+
from src.browser import initialize_driver, close_driver, take_initial_screenshot
|
26 |
+
|
27 |
+
|
28 |
+
def main():
    """
    Test the parsing agent against a single leaderboard URL given on the command line.
    """
    # Load environment variables (.env at the project root)
    load_dotenv()

    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Test de l'agent de parsing de leaderboard")
    parser.add_argument("url", help="URL du leaderboard à parser")
    parser.add_argument("--save", help="Chemin où sauvegarder le résultat JSON (optionnel)")
    parser.add_argument("--uid", help="UID à utiliser pour la capture d'écran (optionnel)")
    parser.add_argument("--wait", type=int, default=10, help="Temps d'attente initial en secondes (défaut: 10)")
    args = parser.parse_args()

    # Make sure we run from the repository root
    os.chdir(root_dir)

    # The agent needs an OpenAI API key; fail fast with guidance if it is absent
    if not os.getenv("OPENAI_API_KEY"):
        print("ERREUR: La variable d'environnement OPENAI_API_KEY n'est pas définie.")
        print("Veuillez créer un fichier .env à la racine du projet avec votre clé API.")
        print("Exemple: OPENAI_API_KEY=votre-clé-api")
        sys.exit(1)

    # Create the images directory if a UID was supplied
    if args.uid:
        images_dir = os.path.join("data", "images")
        os.makedirs(images_dir, exist_ok=True)

    # Get the default model
    model = get_default_model()

    print(f"Test de l'agent sur l'URL: {args.url}")
    if args.uid:
        print(f"UID utilisé pour la capture d'écran: {args.uid}")
    print(f"Temps d'attente initial: {args.wait} secondes")

    # Start the browser; the initial screenshot below honours the custom --wait delay
    initialize_driver()

    if args.uid:
        # Inline replacement for take_initial_screenshot that uses the custom wait time
        import time
        from helium import go_to
        from io import BytesIO
        from PIL import Image

        # Navigate to the URL
        go_to(args.url)

        # Wait for the page to finish loading
        print(f"Attente de {args.wait} secondes pour le chargement complet de la page...")
        time.sleep(args.wait)

        # Take the screenshot
        # NOTE(review): importing `driver` here snapshots the module-level driver
        # created by initialize_driver() above — confirm src.browser keeps it global.
        from src.browser import driver
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))

        # Save the screenshot under data/images/<uid>.png
        images_dir = os.path.join("data", "images")
        os.makedirs(images_dir, exist_ok=True)
        screenshot_path = os.path.join(images_dir, f"{args.uid}.png")
        image.save(screenshot_path)

        print(f"Capture d'écran initiale sauvegardée dans: {screenshot_path}")

        # Close the browser so the agent re-initializes a fresh one
        close_driver()

    # Process the leaderboard
    result = process_leaderboard(args.url, model, 0, args.uid)

    # Attach metadata to the result
    result["url"] = args.url
    if args.uid:
        result["uid"] = args.uid
        result["screenshot"] = f"images/{args.uid}.png" if os.path.exists(os.path.join("data", "images", f"{args.uid}.png")) else None

    # Pretty-print the result as JSON
    json_result = json.dumps(result, indent=2)
    print("\nRésultat JSON:")
    print(json_result)

    # Save the result if requested
    if args.save:
        with open(args.save, "w") as f:
            f.write(json_result)
        print(f"\nRésultat sauvegardé dans: {args.save}")


if __name__ == "__main__":
    main()
|
src/__pycache__/agent.cpython-310.pyc
ADDED
Binary file (13 kB). View file
|
|
src/__pycache__/browser.cpython-310.pyc
ADDED
Binary file (2.27 kB). View file
|
|
src/__pycache__/browser_utils.cpython-310.pyc
ADDED
Binary file (3.56 kB). View file
|
|
src/__pycache__/file_utils.cpython-310.pyc
ADDED
Binary file (8.2 kB). View file
|
|
src/__pycache__/hub_utils.cpython-310.pyc
ADDED
Binary file (4.29 kB). View file
|
|
src/__pycache__/leaderboard_processor.cpython-310.pyc
ADDED
Binary file (3.65 kB). View file
|
|
src/__pycache__/processor.cpython-310.pyc
ADDED
Binary file (9.87 kB). View file
|
|
src/__pycache__/scheduler.cpython-310.pyc
ADDED
Binary file (2.52 kB). View file
|
|
src/__pycache__/server.cpython-310.pyc
ADDED
Binary file (2.44 kB). View file
|
|
src/__pycache__/tools.cpython-310.pyc
ADDED
Binary file (11.3 kB). View file
|
|
src/agents/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (413 Bytes). View file
|
|
src/agents/__pycache__/agent.cpython-310.pyc
ADDED
Binary file (13.1 kB). View file
|
|
src/agents/__pycache__/agent_core.cpython-310.pyc
ADDED
Binary file (1.63 kB). View file
|
|
src/agents/__pycache__/agent_instructions.cpython-310.pyc
ADDED
Binary file (11.2 kB). View file
|
|
src/agents/__pycache__/agent_processor.cpython-310.pyc
ADDED
Binary file (5.58 kB). View file
|
|
src/agents/__pycache__/agent_tools.cpython-310.pyc
ADDED
Binary file (13.2 kB). View file
|
|
src/agents/__pycache__/browser.cpython-310.pyc
ADDED
Binary file (4.05 kB). View file
|
|
src/agents/__pycache__/prompts.cpython-310.pyc
ADDED
Binary file (11 kB). View file
|
|
src/agents/__pycache__/tools.cpython-310.pyc
ADDED
Binary file (13.6 kB). View file
|
|
src/agents/__pycache__/validators.cpython-310.pyc
ADDED
Binary file (3.28 kB). View file
|
|
src/agents/browser.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Browser management for the leaderboard agent.
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
import gc
|
6 |
+
import logging
|
7 |
+
from io import BytesIO
|
8 |
+
from time import sleep
|
9 |
+
|
10 |
+
import helium
|
11 |
+
from PIL import Image
|
12 |
+
from selenium import webdriver
|
13 |
+
from smolagents import CodeAgent
|
14 |
+
from smolagents.agents import ActionStep
|
15 |
+
|
16 |
+
# Configuration du logger
|
17 |
+
logger = logging.getLogger("leaderboard-parser")
|
18 |
+
|
19 |
+
# Global driver variable
|
20 |
+
driver = None
|
21 |
+
|
22 |
+
|
23 |
+
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    """
    Attach an in-memory screenshot of the current page to the given step.

    Used as a CodeAgent step callback for page visualization; images live
    only in agent memory (never written to disk) and screenshots from older
    steps are pruned to limit memory use.
    """
    sleep(2.0)  # Increased to allow time for JavaScript animations
    step_number = memory_step.step_number
    if driver is None:
        return
    # Drop screenshots attached more than one step ago.
    for earlier_step in agent.memory.steps:
        if isinstance(earlier_step, ActionStep) and earlier_step.step_number <= step_number - 2:
            earlier_step.observations_images = None

    # Capture for the agent's eyes only — not persisted to disk.
    screenshot = Image.open(BytesIO(driver.get_screenshot_as_png()))
    print(f"Captured a browser screenshot for agent: {screenshot.size} pixels")
    # Copy so the image outlives the driver's internal buffer.
    memory_step.observations_images = [screenshot.copy()]

    # Append the current URL to any existing observations.
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )
    return
48 |
+
|
49 |
+
|
50 |
+
def initialize_driver():
    """
    Initialize the Selenium WebDriver.
    Returns a configured Chrome WebDriver instance.
    """
    global driver

    # If a driver already exists, tear it down first to avoid memory leaks
    if driver is not None:
        close_driver()

    print("Démarrage de l'initialisation du navigateur Chrome...")

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--force-device-scale-factor=1")
    chrome_options.add_argument("--window-size=1600,1400")
    chrome_options.add_argument("--disable-pdf-viewer")
    chrome_options.add_argument("--window-position=0,0")

    # Options required for the containerized (Docker) environment
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-software-rasterizer")
    chrome_options.add_argument("--no-first-run")
    chrome_options.add_argument("--no-zygote")
    chrome_options.add_argument("--single-process")

    # Path to the Chrome binary (overridable via CHROME_PATH)
    chrome_path = os.environ.get("CHROME_PATH", "/usr/bin/google-chrome-stable")
    if os.path.exists(chrome_path):
        print(f"Utilisation de Chrome à l'emplacement: {chrome_path}")
        chrome_options.binary_location = chrome_path

    # Echo the options for diagnostics
    print(f"Options Chrome configurées: {chrome_options.arguments}")

    try:
        print("Tentative de démarrage de Chrome avec Helium...")
        driver = helium.start_chrome(headless=True, options=chrome_options)
        print("Chrome démarré avec succès!")

        # Log browser/platform information
        print(f"Version de Chrome: {driver.capabilities.get('browserVersion', 'Inconnue')}")
        print(f"Plateforme: {driver.capabilities.get('platformName', 'Inconnue')}")

        # Set page load timeout
        driver.set_page_load_timeout(30)  # Increased to 30 seconds

        return driver
    except Exception as e:
        print(f"ERREUR lors du démarrage de Chrome: {str(e)}")
        # Capture the full traceback for diagnostics
        import traceback
        print("Trace d'erreur complète:")
        traceback.print_exc()

        # Check whether a Chrome binary is available at all
        try:
            import subprocess
            chrome_version_cmd = f"{chrome_path} --version"
            version_output = subprocess.check_output(chrome_version_cmd, shell=True, stderr=subprocess.STDOUT).decode()
            print(f"Version de Chrome installée: {version_output.strip()}")
        except Exception as chrome_check_error:
            print(f"Impossible de vérifier la version de Chrome: {str(chrome_check_error)}")

        raise
|
118 |
+
|
119 |
+
|
120 |
+
def close_driver():
    """
    Close the browser and clean up resources.
    """
    global driver
    try:
        print("Fermeture du navigateur et nettoyage des ressources...")
        # Let helium shut the browser down cleanly.
        helium.kill_browser()
        # Drop the module-level reference so a fresh driver can be created.
        driver = None
        # Force a garbage-collection pass to release browser resources promptly.
        gc.collect()
        print("Navigateur fermé avec succès")
    except Exception as e:
        print(f"Error closing browser: {e}")
|
141 |
+
|
142 |
+
|
143 |
+
# Alias of close_driver, kept for compatibility with browser_utils.cleanup_browser
def cleanup_browser():
    """
    Backwards-compatible alias for close_driver (matches the existing API).
    """
    close_driver()
|
src/agents/fact_checker/fact_checker_agent.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Agent management for the agent leaderboard parser fact checker.
|
3 |
+
"""
|
src/agents/parser/__pycache__/agent.cpython-310.pyc
ADDED
Binary file (13.1 kB). View file
|
|
src/agents/parser/__pycache__/parser_agent.cpython-310.pyc
ADDED
Binary file (13.2 kB). View file
|
|
src/agents/parser/parser_agent.py
ADDED
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Agent management for the leaderboard parser.
|
3 |
+
"""
|
4 |
+
import datetime
|
5 |
+
from smolagents import CodeAgent
|
6 |
+
from smolagents.cli import load_model
|
7 |
+
|
8 |
+
from src.agents.browser import save_screenshot
|
9 |
+
from src.agents.tools import (
|
10 |
+
map_clickable_elements,
|
11 |
+
close_popups,
|
12 |
+
extract_table_data,
|
13 |
+
find_leaderboard_elements,
|
14 |
+
go_back,
|
15 |
+
search_item_ctrl_f,
|
16 |
+
copy_link_from_element,
|
17 |
+
validate_json_results,
|
18 |
+
find_model_links,
|
19 |
+
click_at_coordinates,
|
20 |
+
)
|
21 |
+
|
22 |
+
|
23 |
+
def initialize_agent(model):
    """
    Build the CodeAgent used to parse a leaderboard page.

    Args:
        model: The LLM model to use for the agent

    Returns:
        A configured CodeAgent instance
    """
    # All browsing/extraction tools the agent may call while exploring a page.
    browsing_tools = [
        go_back,
        map_clickable_elements,
        validate_json_results,
        close_popups,
        search_item_ctrl_f,
        extract_table_data,
        find_leaderboard_elements,
        copy_link_from_element,
        find_model_links,
        click_at_coordinates,
    ]
    return CodeAgent(
        tools=browsing_tools,
        model=model,
        # Modules the agent-generated code is allowed to import at runtime.
        additional_authorized_imports=["selenium", "helium", "time", "json", "re"],
        # Capture a browser screenshot after every agent step.
        step_callbacks=[save_screenshot],
        max_steps=25,
        verbosity_level=2,
    )
|
41 |
+
|
42 |
+
|
43 |
+
def get_default_model(model_type="LiteLLMModel", model_id="gpt-4o"):
    """
    Get the default model for the agent.

    The previous version hard-coded the model wrapper and model id; they are now
    parameters with the same values as defaults, so existing callers are unchanged
    while new callers can select a different backend.

    Args:
        model_type: smolagents model wrapper class name (default: "LiteLLMModel")
        model_id: identifier of the underlying LLM (default: "gpt-4o")

    Returns:
        A configured model instance
    """
    return load_model(model_type, model_id)
|
53 |
+
|
54 |
+
|
55 |
+
# Instructions for the agent.
# NOTE: this is runtime prompt text sent to the LLM, not documentation.
# Fixed one internal inconsistency: the example sleeps 20.0 s but the inline
# note said "Wait at least 10 seconds"; it now says 20 to match the code and
# the step list below ("use time.sleep(20.0)").
leaderboard_instructions = """
Your task is to extract the three BEST models from the leaderboard. It is crucial that you identify the models that are at the top of the ranking, not just any three models present on the page.

You must also identify the main criterion on which the models are evaluated (for example: accuracy, speed, performance on a specific benchmark, etc.). Formulate a short description (less than 60 words) that explains what the models are judged on.

For each model, try to find a link to its page or repository. This can be any link (GitHub, Hugging Face, model website, etc.). If you cannot find a link for a model, indicate null for this field.

IMPORTANT: If you fail to clearly identify the top three models AND the evaluation criterion, the leaderboard will be rejected. It is essential that you provide this information accurately and completely.

You can use helium to navigate the website. We have already executed "from helium import *".
You can go to pages with:
```py
go_to('url')
```<end_code>

You can click on clickable elements by entering the text that appears on them:
```py
click("Button text")
```<end_code>

If it's a link:
```py
click(Link("Link text"))
```<end_code>

To scroll up or down, use scroll_down or scroll_up with the number of pixels as an argument:
```py
scroll_down(num_pixels=1200) # This will scroll down one view
```<end_code>

To close popups with an X icon, use the built-in tool `close_popups`:
```py
close_popups()
```<end_code>

You can use .exists() to check for the existence of an element:
```py
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>

If you encounter situations where you cannot click on elements using text, you can use click_at_coordinates to click at specific x,y coordinates on the page:
```py
click_at_coordinates(x=500, y=300) # Click at the position 500px from left, 300px from top
```<end_code>

If pages seem stuck while loading, you may need to wait:
```py
import time
time.sleep(20.0) # Wait at least 20 seconds for the initial loading
```<end_code>

To extract data from a table, use the extract_table_data tool:
```py
table_info = extract_table_data()
print(table_info)
```<end_code>

If you cannot easily find a standard table, use find_leaderboard_elements to search for elements that might contain ranking data:
```py
leaderboard_elements = find_leaderboard_elements()
print(leaderboard_elements)
```<end_code>

RECOMMENDED METHODS FOR FINDING MODEL LINKS:

```py
# For a model named "BERT-Large"
model_name = "BERT-Large"
links_info = find_model_links(model_name)
print(links_info)

# If links were found, the best candidate is displayed at the end of the result
if "Best candidate for" in links_info:
    # Extract the URL of the best candidate
    best_url_line = links_info.split("Best candidate for")[1].split("\n")[1]
    url = best_url_line.replace("URL:", "").strip()
    print(f"URL for model {model_name}: {url}")
else:
    print(f"No link found for model {model_name}")
    url = None
```<end_code>


IMPORTANT: If none of the methods can find a URL, do NOT try other methods such as extracting URLs from the source code. Simply use null for the model URL. It is better to have a missing URL (null) than an incorrect or irrelevant URL.

IMPORTANT - PAGE EXPLORATION ORDER:
If you don't immediately see the leaderboard table or ranking information, STRICTLY follow this order:

1. ABSOLUTE PRIORITY:
   Look for and click on buttons, tabs, or links with text like "Leaderboard", "Results", "Ranking", "Benchmark", "Scores", "Evaluation", etc.
   Examine ALL visible buttons and tabs before moving to the next step.
   IMPORTANT: Be flexible with text matching! Some elements may contain emojis or other characters before/after the keywords.

```py
# Examples of searching for leaderboard buttons/tabs
for text in ["🏆 Leaderboard", "Leaderboard", "Results", "Ranking", "Benchmark", "Scores", "Evaluation", "Performance"]:
    if Button(text).exists() or Link(text).exists() or Text(text).exists():
        print(f"Found clickable element: {text}")
        click(text)
        time.sleep(5) # Wait for the page to update
        break

# If exact matches fail, try more flexible matching
# This is crucial for elements with emojis or other characters
if True: # This will execute if no exact match was found above
    print("No exact matches found. Trying flexible text matching...")
    import time
    from src.agents.browser import driver
    from selenium.webdriver.common.by import By

    for text in ["🏆 Leaderboard", "Leaderboard", "Results", "Ranking", "Benchmark", "Scores"]:
        # Try to find elements CONTAINING the text (not exact match)
        matching_elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")

        if matching_elements:
            print(f"Found {len(matching_elements)} elements containing '{text}'")
            for element in matching_elements[:3]: # Try first three matches
                try:
                    element_text = element.text
                    print(f"Element text: '{element_text}'")
                    driver.execute_script("arguments[0].scrollIntoView(true);", element)
                    time.sleep(1)
                    element.click()
                    print(f"Successfully clicked on element with text: '{element_text}'")
                    time.sleep(5)
                    break
                except Exception as e:
                    print(f"Could not click: {e}")
                    # Try JavaScript click as fallback
                    try:
                        driver.execute_script("arguments[0].click();", element)
                        print(f"Clicked using JavaScript on element with text: '{element_text}'")
                        time.sleep(5)
                        break
                    except:
                        continue
```<end_code>

2. ONLY AFTER checking all buttons and tabs, scroll down to see if the content is lower down:
```py
scroll_down(1200) # Try scrolling to see more content
```<end_code>

3. Check if there are dropdown menus or filters to activate
4. Explore the different sections of the page

Proceed step by step:
1. Navigate to the provided URL
2. Wait for the page to load completely (use time.sleep(20.0))
3. EXPLORE the page by STRICTLY following the order above (first buttons/tabs, then scroll if necessary)
4. Look for the table or section containing the model ranking
5. Identify the three BEST models (those at the top of the ranking) ( DO NOT CHANGE MODEL NAMES UNDER ANY CIRCUMSTANCES )
6. Determine the main evaluation criterion for the models
7. IMPORTANT : For each identified model, use the method described above to find its URL. If the URL is not found, use null.
8. If you cannot find links at the first try for any reason, you can try again with the same method if you want.
9. Validate the results using the validate_json_results tool. VERY IMPORTANT TO DO BEFORE SENDING RESULTS.
10. Send final results

```py
final_answer({
    "top_models": [
        {"rank": 1, "name": "Model name 1", "url": "Model URL or null if not available"},
        {"rank": 2, "name": "Model name 2", "url": "Model URL or null if not available"},
        {"rank": 3, "name": "Model name 3", "url": "Model URL or null if not available"}
    ],
    "evaluation_criteria": "Short description of the evaluation criterion (less than 60 words)"
})
```<end_code>

After each block of code you write, you will automatically receive an updated screenshot of the browser and the current URL of the browser.
But be careful, the screenshot will only be taken at the end of the complete action, it will not see intermediate states.

IMPORTANT: DO NOT CHANGE MODEL NAMES UNDER ANY CIRCUMSTANCES
"""
|
231 |
+
|
232 |
+
|
233 |
+
def validate_results(result):
    """Checks that the results do not contain generic placeholders."""
    # Must be a non-empty dict to be usable at all.
    if not isinstance(result, dict) or not result:
        return False, "Invalid result"

    models = result.get("top_models", [])
    if "top_models" not in result or len(models) < 3:
        return False, "Less than 3 models found"

    # Reject placeholder names such as "Model A" or "Model 1".
    placeholder_names = {
        "model a", "model b", "model c",
        "model 1", "model 2", "model 3",
        "model name", "unavailable",
    }
    for entry in models:
        if entry.get("name", "").lower() in placeholder_names:
            return False, "Generic model names detected"

    # Reject obviously fake example URLs (missing URLs are fine).
    placeholder_hosts = ("example.com", "example.org")
    for entry in models:
        link = entry.get("url")
        if link is None:
            continue
        lowered = link.lower()
        if any(host in lowered for host in placeholder_hosts):
            return False, "Generic URLs detected"

    # A meaningful criterion description is required.
    if "evaluation_criteria" not in result or len(result.get("evaluation_criteria", "")) < 10:
        return False, "Evaluation criterion missing or too short"

    return True, "Valid results"
|
258 |
+
|
259 |
+
|
260 |
+
def process_leaderboard(url, model, index, uid=None, additional_rules=None):
    """
    Process a single leaderboard URL and return the results.

    Drives one full agent session: starts the browser, builds the agent,
    runs it against the page, validates what comes back, and always closes
    the browser afterwards.

    Args:
        url: The URL of the leaderboard to process
        model: The LLM model to use
        index: The index of the leaderboard in the list
        uid: The UID of the leaderboard (for saving screenshots)
        additional_rules: Additional rules specific to this leaderboard

    Returns:
        A dictionary with the results or error information. It always carries
        "parsing_status" ("success" | "invalid" | "error"), "parsed_at"
        (ISO timestamp taken before the run), "results", and — on non-success —
        a "parsing_message" explaining why.
    """
    # Local import — presumably to avoid a circular import with the browser
    # module at load time; confirm before hoisting to the top of the file.
    from src.agents.browser import initialize_driver, close_driver

    print(f"\n\n{'='*50}")
    print(f"Processing leaderboard {index+1}: {url}")
    if uid:
        print(f"UID: {uid}")
    if additional_rules:
        print(f"Additional rules: {additional_rules}")
    print(f"{'='*50}\n")

    # Get current date and time (recorded before the run so even failures
    # carry the timestamp of the attempt).
    now = datetime.datetime.now()
    parsed_at = now.isoformat()

    initialize_driver()

    agent = initialize_agent(model)

    # Create the prompt with the target URL
    prompt = f"Visit {url} and extract the three BEST models from the leaderboard (those at the top of the ranking). Also identify the main evaluation criterion for the models and look for links associated with the models."

    # Add additional rules if provided
    instructions = leaderboard_instructions
    if additional_rules:
        # NOTE(review): the extra rules are inserted both BEFORE and AFTER the
        # base instructions — presumably deliberate emphasis at both ends of a
        # long prompt; confirm the duplication is intentional.
        instructions = f"""

ADDITIONAL RULES SPECIFIC TO THIS LEADERBOARD:
{additional_rules}

{leaderboard_instructions}

ADDITIONAL RULES SPECIFIC TO THIS LEADERBOARD:
{additional_rules}
"""

    try:
        # Run the agent with the provided prompt; helium is pre-imported into
        # the agent's Python executor so generated code can call it directly.
        agent.python_executor("from helium import *")
        result = agent.run(prompt + instructions)

        print(f"\nResult for {url}:")
        print(result)

        # Check if the result is None or empty
        if not result:
            return {
                "results": None,
                "parsing_status": "error",
                "parsing_message": "Empty result from agent",
                "parsed_at": parsed_at
            }

        # Validate the results (placeholder names/URLs, missing criterion, ...)
        is_valid, reason = validate_results(result)
        if not is_valid:
            print(f"WARNING: {reason}")
            return {
                "results": result,
                "parsing_status": "invalid",
                "parsing_message": reason,
                "parsed_at": parsed_at
            }

        # Make sure the response is in the correct format
        if not isinstance(result, dict) or "top_models" not in result:
            print("WARNING: Agent did not use final_answer() correctly")
            return {
                "results": None,
                "parsing_status": "error",
                "parsing_message": "Agent returned improperly formatted response (did not use final_answer correctly)",
                "parsed_at": parsed_at
            }

        return {
            "results": result,
            "parsing_status": "success",
            "parsed_at": parsed_at
        }
    except Exception as e:
        print(f"An error occurred while processing {url}: {e}")
        return {
            "results": None,
            "parsing_status": "error",
            "parsing_message": str(e),
            "parsed_at": parsed_at
        }
    finally:
        # Ensure browser is closed even when the agent run raises.
        close_driver()
|
src/agents/tools.py
ADDED
@@ -0,0 +1,443 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Tools for the leaderboard agent.
|
3 |
+
"""
|
4 |
+
from selenium import webdriver
|
5 |
+
from selenium.webdriver.common.by import By
|
6 |
+
from selenium.webdriver.common.keys import Keys
|
7 |
+
from selenium.webdriver.common.action_chains import ActionChains
|
8 |
+
import re
|
9 |
+
import time
|
10 |
+
import helium
|
11 |
+
|
12 |
+
from smolagents import tool
|
13 |
+
|
14 |
+
|
15 |
+
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    from src.agents.browser import driver

    # NOTE(review): `text` is interpolated into the XPath unescaped — a value
    # containing a single quote will break the expression; confirm inputs.
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
    result = f"Found {len(elements)} matches for '{text}'."
    elem = elements[nth_result - 1]
    # Bring the requested occurrence into the viewport.
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    # FIX: a separating space was missing, producing "…'.Focused on element…".
    result += f" Focused on element {nth_result} of {len(elements)}"
    return result
|
33 |
+
|
34 |
+
|
35 |
+
@tool
def go_back() -> str:
    """
    Navigate back to the previous page.
    """
    from src.agents.browser import driver

    driver.back()
    # Give the previous page a moment to render before the next action.
    time.sleep(2)
    return "Navigated back to previous page"
|
45 |
+
|
46 |
+
|
47 |
+
@tool
def close_popups() -> str:
    """
    Closes any popup/modal dialogs that might be open on the page.
    Useful when pop-ups appear (cookies, login prompts, etc.) that block interaction.
    """
    from src.agents.browser import driver

    # XPath patterns covering the dismiss buttons most sites use.
    popup_selectors = [
        "//button[contains(text(), 'Accept')]",
        "//button[contains(text(), 'Close')]",
        "//button[contains(text(), 'Fermer')]",
        "//button[contains(text(), 'OK')]",
        "//button[contains(text(), 'Got it')]",
        "//button[contains(@class, 'close')]",
        "//div[contains(@class, 'popup')]//button",
        "//div[contains(@class, 'modal')]//button",
        "//div[contains(@class, 'dialog')]//button"
    ]

    closed_any = False
    for xpath in popup_selectors:
        try:
            for candidate in driver.find_elements(By.XPATH, xpath):
                if candidate.is_displayed():
                    candidate.click()
                    closed_any = True
                    time.sleep(0.5)  # let the dialog animate away
        except Exception:
            # Selector failed or an element went stale — move on to the next one.
            pass

    return "Closed popup dialogs" if closed_any else "No popup dialogs found"
|
81 |
+
|
82 |
+
|
83 |
+
@tool
def extract_table_data(table_caption: str = None, table_index: int = 1) -> str:
    """
    Extracts data from a table on the page. Can find a table by caption/title or by index.
    Args:
        table_caption: Text in or near the table to find (default: None - will use index)
        table_index: The index of the table if caption is not provided (1-based)
    """
    # NOTE(review): table_caption and table_index are currently UNUSED — the
    # function previews EVERY <table> on the page regardless. Confirm whether
    # per-table selection should be implemented or the parameters removed.
    from src.agents.browser import driver

    tables = driver.find_elements(By.TAG_NAME, "table")
    if not tables:
        return "No tables found on the page."

    result = f"Found {len(tables)} table(s) on the page.\n"

    for i, table in enumerate(tables):
        result += f"\nTable {i+1}:\n"

        # Try to get headers
        headers = table.find_elements(By.TAG_NAME, "th")
        if headers:
            header_texts = [header.text for header in headers]
            result += f"Headers: {', '.join(header_texts)}\n"

        # Get rows
        rows = table.find_elements(By.TAG_NAME, "tr")
        result += f"Found {len(rows)} rows.\n"

        # Get first 5 rows as sample (rows containing only <th> cells are skipped
        # here because they have no <td> children)
        for j, row in enumerate(rows[:5]):
            cells = row.find_elements(By.TAG_NAME, "td")
            if cells:
                cell_texts = [cell.text for cell in cells]
                result += f"Row {j+1}: {' | '.join(cell_texts)}\n"

    return result
|
120 |
+
|
121 |
+
|
122 |
+
@tool
def find_leaderboard_elements() -> str:
    """
    Find key elements of a leaderboard: title, evaluation criteria, and model rankings.
    Returns a structured description of what was found.
    """
    from src.agents.browser import driver

    findings = []

    # Standard HTML tables are the most common leaderboard container.
    tables = driver.find_elements(By.TAG_NAME, "table")
    if tables:
        findings.append(f"Found {len(tables)} table(s) that might contain leaderboard data.\n")

    # Ordered lists sometimes encode rankings directly.
    ordered_lists = driver.find_elements(By.TAG_NAME, "ol")
    if ordered_lists:
        findings.append(f"Found {len(ordered_lists)} ordered list(s) that might contain rankings.\n")

    # Custom grid/flex layouts that may act as hand-rolled leaderboards.
    layout_divs = driver.find_elements(By.XPATH, "//div[contains(@class, 'grid') or contains(@class, 'flex') or contains(@class, 'table') or contains(@class, 'rank') or contains(@class, 'leaderboard')]")
    if layout_divs:
        findings.append(f"Found {len(layout_divs)} div elements with grid/flex/table classes that might be custom leaderboards.\n")

    # Elements whose class names suggest a rank/position indicator.
    ranked = driver.find_elements(By.XPATH, "//*[contains(@class, 'rank') or contains(@class, 'position') or contains(@class, 'standing')]")
    if ranked:
        findings.append(f"Found {len(ranked)} elements with rank/position classes.\n")

    if not findings:
        return "Could not find any obvious leaderboard elements. Try scrolling or navigating to the correct section."

    return "".join(findings)
|
156 |
+
|
157 |
+
@tool
def map_clickable_elements(keyword: str = None) -> str:
    """
    Displays a list of all clickable elements on the page with their coordinates.

    Args:
        keyword: Optional keyword to filter elements. If specified, only elements containing this keyword will be displayed.

    Returns:
        A string listing all clickable elements with their coordinates.
    """
    from src.agents.browser import driver

    # CSS selectors covering the usual ways a page marks something clickable.
    clickable_selectors = [
        "a", "button", "input[type='button']", "input[type='submit']",
        ".clickable", "[role='button']", "[onclick]"
    ]

    result = "Éléments cliquables détectés:\n"
    total = 0

    for selector in clickable_selectors:
        for element in driver.find_elements(By.CSS_SELECTOR, selector):
            try:
                text = element.text.strip()
                # Inputs often carry their label in `value` rather than text.
                if not text and element.get_attribute("value"):
                    text = element.get_attribute("value")

                # Skip empty or invisible elements.
                if not text or not element.is_displayed():
                    continue

                # Apply the keyword filter, if any (case-insensitive).
                if keyword and keyword.lower() not in text.lower():
                    continue

                # Click target = center of the element's bounding box.
                rect = element.rect
                x = int(rect['x'] + rect['width']/2)
                y = int(rect['y'] + rect['height']/2)

                result += f"{total+1}. '{text}' ({selector}) - coords: x={x}, y={y}\n"
                total += 1
            except Exception:
                # FIX: was a bare `except:` which also swallowed KeyboardInterrupt/
                # SystemExit; stale or unreadable elements are simply skipped.
                continue

    result += f"\nTotal: {total} éléments cliquables" + (" contenant '" + keyword + "'" if keyword else "")
    return result
|
205 |
+
|
206 |
+
@tool
def copy_link_from_element(text_to_find: str, link_position: int = 1) -> str:
    """
    Find elements with specified text and return the URL if it's a link or has a parent link.
    Args:
        text_to_find: Text to search for
        link_position: If multiple matches, which one to use (1-based)
    """
    from src.agents.browser import driver
    from selenium.common.exceptions import NoSuchElementException

    try:
        # FIX: driver.find_element_by_xpath was removed in Selenium 4, so every
        # call previously raised AttributeError and fell into the error path.
        matches = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text_to_find}')]")
        if not matches:
            return f"No element containing the text '{text_to_find}' was found."
        # Honor link_position (1-based); the original silently ignored it.
        element = matches[min(link_position, len(matches)) - 1]

        # Method 1: URL carried directly by the element.
        href = element.get_attribute("href")
        if href:
            return f"URL found: {href}"

        # Method 2: closest ancestor that is a link. `find_element` raises when
        # there is no match, so each fallback is wrapped individually.
        try:
            parent = element.find_element(By.XPATH, "./ancestor::a")
            href = parent.get_attribute("href")
            if href:
                return f"URL found in parent element: {href}"
        except NoSuchElementException:
            pass

        # Method 3: a link nested inside the element.
        try:
            child = element.find_element(By.XPATH, ".//a")
            href = child.get_attribute("href")
            if href:
                return f"URL found in child element: {href}"
        except NoSuchElementException:
            pass

        # Method 4: right-click and use the browser's "Copy link address" entry.
        actions = ActionChains(driver)
        actions.context_click(element).perform()

        # Give the context menu time to appear.
        time.sleep(1)

        # Browser- and locale-dependent labels for the context-menu entry.
        copy_link_texts = ["Copy link address", "Copier l'adresse du lien", "Copy Link", "Copier le lien"]

        for label in copy_link_texts:
            try:
                link_option = driver.find_element(By.XPATH, f"//div[contains(text(), '{label}')]")
                link_option.click()
                return f"Action 'Copier l'adresse du lien' effectuée pour '{text_to_find}'"
            except Exception:
                continue

        # Dismiss the context menu before giving up.
        ActionChains(driver).send_keys(Keys.ESCAPE).perform()

        return f"Impossible de trouver un lien pour l'élément '{text_to_find}' avec les méthodes disponibles."

    except Exception as e:
        return f"Erreur lors de la recherche du lien: {str(e)}"
|
268 |
+
|
269 |
+
@tool
def validate_json_results(result: dict) -> tuple[bool, str]:
    """
    Checks that the results do not contain generic placeholders.
    Args:
        result: The result to validate
    Returns:
        A tuple containing a boolean indicating if the result is valid and a message
        explaining why the result is invalid if it is not valid.
    """
    if not result or not isinstance(result, dict):
        return False, "Invalid result"

    if "top_models" not in result or len(result.get("top_models", [])) < 3:
        return False, "Less than 3 models found"

    # Check for duplicate models (case-insensitive)
    seen_models = set()
    for model in result.get("top_models", []):
        model_name = model.get("name", "").lower()
        if model_name in seen_models:
            return False, f"Duplicate model '{model.get('name')}' found. Please ensure each model is unique."
        seen_models.add(model_name)

    # Check for generic placeholder names
    generic_names = ["model a", "model b", "model c", "model 1", "model 2", "model 3", "model name", "unavailable"]
    model_names = [m.get("name", "").lower() for m in result.get("top_models", [])]
    if any(name in generic_names for name in model_names):
        return False, "Generic model names detected"

    # Check for unwanted parenthesized suffixes, e.g. "LLaMA (fine-tuned)"
    unwanted_suffix_pattern = r"\(.*\)$"
    for model in result.get("top_models", []):
        if re.search(unwanted_suffix_pattern, model.get("name", "")):
            return False, f"Model name '{model.get('name')}' contains unwanted suffixes. Please remove them if you think they are not part of the model name. If it's a version number or a date, keep it."

    # Check for generic URLs
    generic_urls = ["example.com", "example.org"]
    model_urls = [m.get("url", "").lower() for m in result.get("top_models", []) if m.get("url") is not None]
    if any(generic in url for url in model_urls for generic in generic_urls):
        return False, "Generic URLs detected"

    # Check for a character-window overlap between each model name and its URL
    for model in result.get("top_models", []):
        name = model.get("name", "").lower()
        url = model.get("url")

        # Skip validation if URL is None or empty - this is acceptable, so no warning
        if not url:
            continue

        url = url.lower()
        if len(name) >= 4:
            # Slide a 4-character window over the name and look for it in the URL.
            has_overlap = any(name[i:i + 4] in url for i in range(len(name) - 3))
        else:
            # BUGFIX: names shorter than 4 characters made the window range empty,
            # so any() was always False and such models were unconditionally
            # rejected. Fall back to direct containment for short names.
            has_overlap = bool(name) and name in url
        if not has_overlap:
            return False, f"URL for model '{model.get('name')}' does not have a valid submatch with the name. This is probably a wrong URL. Please check the URL and try again."

    # Check the evaluation criterion
    if "evaluation_criteria" not in result or len(result.get("evaluation_criteria", "")) < 10:
        return False, "Evaluation criterion missing or too short"

    return True, "Valid results"
|
329 |
+
|
330 |
+
@tool
def find_model_links(model_name: str) -> str:
    """
    Search for links that might point to a model based on their URL
    and their match with the model name.

    Args:
        model_name: The name of the model to search for

    Returns:
        A formatted report (single string) listing candidate links with
        URL, anchor text and a confidence score, plus the best candidate.
    """
    # Imported lazily so the module can be loaded before the shared
    # Selenium driver is initialised.
    from src.agents.browser import driver
    try:
        # 1. Retrieve all anchor elements on the current page
        all_links = driver.find_elements(By.TAG_NAME, "a")
        if not all_links:
            return "No links were found on the page."

        # 2. Known URL patterns for model repositories
        model_url_patterns = [
            r'huggingface\.co/[^/]+/[^/]+',  # Hugging Face model repo
            r'github\.com/[^/]+/[^/]+',      # GitHub repo
        ]

        model_links = []
        model_name_lower = model_name.lower()

        for link in all_links:
            try:
                # Check if the link is visible and has an href attribute
                if not link.is_displayed() or not link.get_attribute('href'):
                    continue

                link_url = link.get_attribute('href')
                link_text = link.text.strip()

                # Ignore links to non-relevant resources (static assets)
                if link_url.endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.ico', '.css', '.js')):
                    continue

                # Check if the URL matches a known repository pattern
                matches_pattern = any(re.search(pattern, link_url, re.IGNORECASE) for pattern in model_url_patterns)

                if matches_pattern:
                    # Require a common 5-character substring between the model
                    # name and BOTH the URL and the anchor text.
                    # NOTE(review): earlier comments said "3-character", but the
                    # slicing below actually checks 5-character windows; names
                    # shorter than 5 characters can never match (empty range).
                    url_lower = link_url.lower()
                    has_submatch = False

                    # Slide a 5-character window over the model name
                    for i in range(len(model_name_lower) - 4):
                        if model_name_lower[i:i+5] in url_lower and model_name_lower[i:i+5] in link_text.lower():
                            has_submatch = True
                            break

                    if has_submatch:
                        # Crude confidence: count model-name characters that
                        # appear anywhere in the anchor text (not positional).
                        confidence = sum(1 for c in model_name_lower if c in link_text.lower())
                        model_links.append({
                            'url': link_url,
                            'text': link_text,
                            'confidence': confidence
                        })
            except Exception as e:
                continue  # Ignore per-link errors (e.g. stale elements) and continue

        # 3. Format the result
        if not model_links:
            return f"No potential links to the model '{model_name}' were found."

        result = f"Found {len(model_links)} potential links for the model '{model_name}':\n\n"

        for i, link in enumerate(model_links):
            result += f"Candidate {i+1}:\n"
            result += f"URL: {link['url']}\n"
            result += f"Text: {link['text']}\n"
            result += f"Confidence: {link['confidence']}\n\n"

        # 4. Suggest the best candidate (the one with the highest confidence)
        if model_links:
            best_candidate = max(model_links, key=lambda x: x['confidence'])
            result += f"Best candidate for '{model_name}':\nURL: {best_candidate['url']}\nText: {best_candidate['text']} "

        return result
    except Exception as e:
        return f"Error while searching for links for the model '{model_name}': {str(e)}"
|
415 |
+
|
416 |
+
@tool
def click_at_coordinates(x: int, y: int) -> str:
    """
    Clicks at the specified x,y coordinates on the page.
    This is useful when other targeting methods fail or when dealing with complex UI elements.

    Args:
        x: The x-coordinate to click at
        y: The y-coordinate to click at

    Returns:
        A message confirming the click action
    """
    # Imported lazily so the module can be loaded before the shared
    # Selenium driver is initialised.
    from src.agents.browser import driver

    try:
        # Using ActionChains for precise coordinate clicks.
        # NOTE(review): ActionChains.move_by_offset moves RELATIVE to the
        # current pointer position, not to absolute page coordinates — this
        # only clicks at (x, y) if the pointer starts at the origin. Verify
        # against Selenium's ActionChains documentation.
        actions = ActionChains(driver)
        actions.move_by_offset(x, y).click().perform()
        actions.reset_actions()  # Reset stored actions after the click

        # Alternative approach using Helium
        # helium.click_at_point(x, y)

        time.sleep(1)  # Wait a moment for any reactions to the click
        return f"Successfully clicked at coordinates ({x}, {y})"
    except Exception as e:
        return f"Failed to click at coordinates ({x}, {y}): {str(e)}"
|
src/file_utils.py
ADDED
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Utilities for file management.
|
3 |
+
"""
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
import datetime
|
7 |
+
import shutil
|
8 |
+
import time
|
9 |
+
import random
|
10 |
+
import tempfile
|
11 |
+
import logging
|
12 |
+
from filelock import FileLock
|
13 |
+
|
14 |
+
logger = logging.getLogger("leaderboard-parser")
|
15 |
+
|
16 |
+
def save_results(results, file_path):
    """
    Serialize *results* to *file_path* as pretty-printed JSON.

    Args:
        results: Any JSON-serializable object to persist
        file_path: Destination path (overwritten if it exists)
    """
    serialized = json.dumps(results, indent=2)
    with open(file_path, "w") as handle:
        handle.write(serialized)
26 |
+
|
27 |
+
|
28 |
+
def create_category_slug(category_name):
    """
    Build a URL-friendly slug from a category name.

    The slug is lowercase and uses only hyphens as separators: both spaces
    and underscores are converted to "-".

    Args:
        category_name: The category name (may be None or empty)

    Returns:
        The category slug, or "" for a falsy input
    """
    if not category_name:
        return ""
    slug = category_name.lower()
    # Normalize every separator to a hyphen — never keep underscores.
    for separator in (" ", "_"):
        slug = slug.replace(separator, "-")
    return slug
|
44 |
+
|
45 |
+
|
46 |
+
def create_combined_id(category, uid):
    """
    Create a normalized combined identifier from a category and a UID.

    The category part is slugified exactly like create_category_slug
    (lowercase, spaces and underscores replaced with hyphens); the UID is
    appended after a single underscore separator.

    Args:
        category: The category name (may be None or empty)
        uid: The UID of the leaderboard

    Returns:
        The combined identifier in the format category_slug_uid
    """
    # Slug logic mirrors create_category_slug so both always agree.
    slug = "" if not category else category.lower().replace(" ", "-").replace("_", "-")
    return f"{slug}_{uid}"
|
60 |
+
|
61 |
+
|
62 |
+
def validate_leaderboard_result(result):
    """
    Validate a leaderboard result and repair its identifiers in place.

    Ensures that:
      1. 'original_uid' is present
      2. 'category' is present and normalized (slug form)
      3. 'uid' exists and equals create_combined_id(category, original_uid)

    Args:
        result: The leaderboard result to validate (dict)

    Returns:
        The validated (possibly corrected) dict, or None if validation
        is impossible
    """
    if not isinstance(result, dict):
        logger.error(f"Validation error: the result is not a dictionary")
        return None

    # Both identifier fields are mandatory; reject on the first one missing.
    for field_name in ("original_uid", "category"):
        if field_name not in result:
            logger.error(f"Validation error: {field_name} missing from result")
            return None

    original_uid = result["original_uid"]
    category = result["category"]

    # Normalize the category if it is not already in slug form.
    normalized_category = create_category_slug(category)
    if normalized_category != category:
        logger.warning(f"Category not normalized: '{category}' -> '{normalized_category}'")
        result["category"] = normalized_category

    # Recompute the expected combined uid from the normalized parts.
    correct_uid = create_combined_id(normalized_category, original_uid)

    if "uid" not in result:
        logger.warning(f"uid missing, adding calculated uid: {correct_uid}")
        result["uid"] = correct_uid
    elif result["uid"] != correct_uid:
        logger.warning(f"uid inconsistent: '{result['uid']}' does not match '{correct_uid}', correction applied")
        result["uid"] = correct_uid

    return result
|
112 |
+
|
113 |
+
|
114 |
+
def load_and_validate_results(file_path):
    """
    Load leaderboard results from *file_path* without strict validation.

    Tolerates a missing file, invalid JSON, and the legacy dict format with
    a top-level "leaderboards" mapping (converted to a flat list).

    Args:
        file_path: Path to the results file

    Returns:
        List of results sorted by (category, original_uid), or an empty
        list in case of any error
    """
    try:
        try:
            with open(file_path, "r", encoding="utf-8") as handle:
                data = json.load(handle)
        except (FileNotFoundError, json.JSONDecodeError) as exc:
            logger.warning(f"Unable to load file {file_path}: {str(exc)}")
            return []

        # Legacy format: {"leaderboards": {uid: entry, ...}} — flatten to a
        # list, copying each entry and injecting its uid.
        if isinstance(data, dict) and "leaderboards" in data:
            data = [dict(entry, uid=uid) for uid, entry in data["leaderboards"].items()]

        if not isinstance(data, list):
            logger.warning(f"Invalid data format in {file_path}, initializing empty list")
            return []

        # Stable, deterministic ordering for downstream diffing.
        data.sort(key=lambda entry: (entry.get("category", ""), entry.get("original_uid", "")))

        logger.info(f"Load successful: {len(data)} results")
        return data

    except Exception as exc:
        logger.error(f"Error loading results: {str(exc)}")
        return []
|
156 |
+
|
157 |
+
|
158 |
+
def update_leaderboard_result(leaderboard_result, file_path, max_wait_seconds=30):
    """
    Updates a leaderboard result in the specified file.
    If an entry with the same uid already exists, it is updated (merged).
    Otherwise, a new entry is added. The write is atomic (temp file + move)
    and guarded by a file lock against concurrent writers.

    Args:
        leaderboard_result: The leaderboard result to update (must contain a uid)
        file_path: Path to the results file
        max_wait_seconds: Maximum wait time for file lock (in seconds)

    Returns:
        Updated results list or None in case of error
    """
    if not leaderboard_result or "uid" not in leaderboard_result:
        logger.error("Unable to update: invalid or missing leaderboard result or uid")
        return None

    # Create parent directory if necessary
    # NOTE(review): os.path.dirname returns "" for a bare filename, and
    # os.makedirs("") raises FileNotFoundError — confirm callers always
    # pass a path with a directory component.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # Use a lock to avoid concurrent writes from multiple workers
    lock_path = f"{file_path}.lock"
    lock = FileLock(lock_path, timeout=max_wait_seconds)

    try:
        with lock:
            # Load existing results (tolerant loader: returns [] on failure)
            current_results = load_and_validate_results(file_path)

            # Index by uid for easy update; entries without a uid are dropped
            results_by_uid = {r.get("uid", ""): r for r in current_results if "uid" in r}

            # Update or add result
            uid = leaderboard_result["uid"]
            if uid in results_by_uid:
                # Merge into the existing entry (shallow dict.update)
                results_by_uid[uid].update(leaderboard_result)
                logger.info(f"Result updated for uid: {uid}")
            else:
                # Add new result
                results_by_uid[uid] = leaderboard_result
                logger.info(f"New result added for uid: {uid}")

            # Convert to list for writing
            updated_results = list(results_by_uid.values())

            # Sort results for stable, diff-friendly output
            updated_results.sort(key=lambda x: (x.get("category", ""), x.get("original_uid", "")))

            # Write to temporary file then rename for atomicity (the temp
            # file lives in the same directory so the move stays atomic
            # on POSIX filesystems)
            fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(file_path))
            try:
                with os.fdopen(fd, 'w', encoding='utf-8') as f:
                    json.dump(updated_results, f, indent=2, ensure_ascii=False)

                # Replace original file with temporary file
                shutil.move(temp_path, file_path)
                logger.info(f"File updated successfully: {file_path}")

                return updated_results
            except Exception as e:
                # Clean up the leftover temp file in case of error
                if os.path.exists(temp_path):
                    os.unlink(temp_path)
                raise e

    except Exception as e:
        logger.error(f"Error updating file {file_path}: {str(e)}")
        return None
|
228 |
+
|
229 |
+
|
230 |
+
def split_combined_id(combined_id):
    """
    Split a combined identifier (category_uid) into its components.

    Only the FIRST underscore acts as the separator, so UIDs containing
    underscores survive intact.

    Args:
        combined_id: The combined identifier (category_uid)

    Returns:
        A tuple (category, uid); (None, combined_id) when there is no
        underscore, and (None, None) for a falsy input
    """
    if not combined_id:
        return None, None

    category, separator, uid = combined_id.partition("_")
    if separator:
        return category, uid
    # No underscore at all: treat the whole string as a bare uid.
    return None, combined_id
|
251 |
+
|
252 |
+
|
253 |
+
def format_datetime(dt_str):
    """
    Format a datetime (object or string) as "DD/MM/YYYY à HH:MM:SS".

    Accepts a datetime.datetime instance, an ISO-8601 string, or one of a
    few common fallback formats. On any parse failure the input is
    returned unchanged.

    Args:
        dt_str: The datetime string (or datetime object) to format

    Returns:
        A formatted datetime string, or the original input if it could
        not be parsed
    """
    try:
        if isinstance(dt_str, datetime.datetime):
            parsed = dt_str
        else:
            # First try the standard ISO parser, then a list of common
            # explicit formats (with/without fraction and timezone).
            try:
                parsed = datetime.datetime.fromisoformat(dt_str)
            except ValueError:
                parsed = None
                for candidate in (
                    "%Y-%m-%dT%H:%M:%S.%f%z",
                    "%Y-%m-%dT%H:%M:%S.%f",
                    "%Y-%m-%dT%H:%M:%S%z",
                    "%Y-%m-%dT%H:%M:%S",
                    "%Y-%m-%d %H:%M:%S",
                    "%Y-%m-%d",
                ):
                    try:
                        parsed = datetime.datetime.strptime(dt_str, candidate)
                    except ValueError:
                        continue
                    break
                if parsed is None:
                    # Nothing matched: hand back the raw input.
                    return dt_str

        return parsed.strftime("%d/%m/%Y à %H:%M:%S")
    except (ValueError, TypeError) as exc:
        print(f"Error formatting date {dt_str}: {exc}")
        return dt_str
|
298 |
+
|
299 |
+
|
300 |
+
def clean_output_files(results_file):
    """
    Reset *results_file* to an empty JSON list, keeping a backup copy.

    If the file already exists, it is first copied to "<results_file>.backup"
    (metadata preserved) before being overwritten.

    Args:
        results_file: The results file to clean
    """
    # Preserve the previous content before wiping it.
    if os.path.exists(results_file):
        backup_path = f"{results_file}.backup"
        shutil.copy2(results_file, backup_path)
        print(f"Backup of {results_file} created in {backup_path}")

    # Overwrite (or create) the file with an empty JSON array.
    with open(results_file, "w") as handle:
        handle.write(json.dumps([], indent=2))
    print(f"File {results_file} cleaned")
|
src/hub_utils.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Utilities for interacting with the Hugging Face Hub.
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
from huggingface_hub import HfApi, login, hf_hub_download
|
6 |
+
|
7 |
+
|
8 |
+
def upload_to_hub(to_parse_file, results_file, repo_id=None):
    """
    Uploads files to the Hugging Face Hub.

    Args:
        to_parse_file: Path to the categories file
        results_file: Path to the results file
        repo_id: Hub repository ID; defaults to the HUGGING_FACE_STORAGE_REPO
            environment variable, then to a hard-coded fallback dataset

    Returns:
        True if upload succeeded, False otherwise
    """
    try:
        # Use environment variable HUGGING_FACE_STORAGE_REPO if available
        # Otherwise, use default value
        if repo_id is None:
            repo_id = os.getenv("HUGGING_FACE_STORAGE_REPO", "leaderboard-explorer/leaderboard_explorer")
            if os.getenv("HUGGING_FACE_STORAGE_REPO"):
                print(f"Using target dataset specified in HUGGING_FACE_STORAGE_REPO: {repo_id}")
            else:
                print(f"No target dataset specified, using default value: {repo_id}")

        # Check if token is available; without it the Hub API cannot commit
        token = os.getenv("HUGGING_FACE_HUB_TOKEN")
        if not token:
            print("ERROR: Environment variable HUGGING_FACE_HUB_TOKEN is not defined.")
            return False

        # Connect to Hub
        print("Connecting to Hugging Face Hub...")
        login(token=token)
        api = HfApi()

        # Upload JSON files
        print(f"\n--- UPLOADING CATEGORIES FILE ---")
        print(f"Local file: {to_parse_file}")
        print(f"Destination: {repo_id}/best_model_for_category_list.json")
        print(f"Uploading...")

        try:
            api.upload_file(
                path_or_fileobj=to_parse_file,
                path_in_repo="best_model_for_category_list.json",
                repo_id=repo_id,
                repo_type="dataset",
                commit_message="Update leaderboard categories"
            )
            print(f"Upload of {to_parse_file} successful!")
        except Exception as e:
            # The Hub raises when a commit would be empty; treat that
            # specific case as a no-op success and re-raise anything else.
            print(f"Note when uploading {to_parse_file}: {e}")
            if "No files have been modified since last commit" in str(e):
                print("→ The categories file is identical to the one already on the Hub. No changes needed.")
            else:
                print(f"→ ERROR: Upload failed for another reason.")
                raise e

        print(f"\n--- UPLOADING RESULTS FILE ---")
        print(f"Local file: {results_file}")
        print(f"Destination: {repo_id}/best_model_for_results.json")
        print(f"Uploading...")

        try:
            api.upload_file(
                path_or_fileobj=results_file,
                path_in_repo="best_model_for_results.json",
                repo_id=repo_id,
                repo_type="dataset",
                commit_message="Update leaderboard results"
            )
            print(f"Upload of {results_file} successful!")
        except Exception as e:
            # Same empty-commit tolerance as for the categories file.
            print(f"Note when uploading {results_file}: {e}")
            if "No files have been modified since last commit" in str(e):
                print("→ The results file is identical to the one already on the Hub. No changes needed.")
            else:
                print(f"→ ERROR: Upload failed for another reason.")
                raise e

        print(f"\nUpload operation completed: files have been processed!")
        return True
    except Exception as e:
        print(f"GENERAL ERROR during file upload to Hub: {e}")
        return False
|
91 |
+
|
92 |
+
|
93 |
+
def download_from_hub(repo_id=None):
    """
    Downloads the parser's data files from the Hugging Face Hub into ./data.

    Required files abort the whole download on failure; optional files only
    emit a warning (they will be recreated later if necessary).

    Args:
        repo_id: Hub repository ID; defaults to the HUGGING_FACE_STORAGE_REPO
            environment variable, then to a hard-coded fallback dataset

    Returns:
        True if download succeeded, False otherwise
    """
    try:
        # Use environment variable HUGGING_FACE_STORAGE_REPO if available
        # Otherwise, use default value
        if repo_id is None:
            repo_id = os.getenv("HUGGING_FACE_STORAGE_REPO", "leaderboard-explorer/leaderboard_explorer")
            if os.getenv("HUGGING_FACE_STORAGE_REPO"):
                print(f"Using source dataset specified in HUGGING_FACE_STORAGE_REPO: {repo_id}")
            else:
                print(f"No source dataset specified, using default value: {repo_id}")

        # Check if token is available
        token = os.getenv("HUGGING_FACE_HUB_TOKEN")
        if not token:
            print("ERROR: Environment variable HUGGING_FACE_HUB_TOKEN is not defined.")
            return False

        # Connect to Hub
        login(token=token)

        # Create data directory if it doesn't exist
        script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        data_dir = os.path.join(script_dir, "data")
        os.makedirs(data_dir, exist_ok=True)

        # List of required and optional files
        required_files = [
            "final_leaderboards.json",
            "best_model_for_category_list.json"
        ]

        optional_files = [
            "best_model_for_results.json"
        ]

        # Download required files first: any failure aborts
        for filename in required_files:
            local_path = os.path.join(data_dir, filename)
            try:
                # FIX: progress/error messages previously printed the literal
                # "(unknown)" instead of the file actually being processed.
                print(f"Downloading {filename} from {repo_id}...")
                hf_hub_download(
                    repo_id=repo_id,
                    filename=filename,
                    repo_type="dataset",
                    local_dir=data_dir,
                    local_dir_use_symlinks=False
                )
                print(f"File {filename} successfully downloaded to {local_path}")
            except Exception as e:
                print(f"ERROR: Unable to download required file {filename}: {e}")
                return False

        # Download optional files next: failures only warn
        for filename in optional_files:
            local_path = os.path.join(data_dir, filename)
            try:
                print(f"Downloading {filename} from {repo_id}...")
                hf_hub_download(
                    repo_id=repo_id,
                    filename=filename,
                    repo_type="dataset",
                    local_dir=data_dir,
                    local_dir_use_symlinks=False
                )
                print(f"File {filename} successfully downloaded to {local_path}")
            except Exception as e:
                print(f"WARNING: Unable to download optional file {filename}: {e}")
                print(f"This is not a problem, a new file will be created if necessary.")

        return True
    except Exception as e:
        print(f"ERROR during file download from Hub: {e}")
        return False
|
src/leaderboard_processor.py
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Leaderboard processing.
|
3 |
+
"""
|
4 |
+
import datetime
|
5 |
+
import os
|
6 |
+
from src.agents.parser.parser_agent import process_leaderboard
|
7 |
+
from src.file_utils import create_category_slug, split_combined_id
|
8 |
+
|
9 |
+
|
10 |
+
def normalize_category(category_name):
    """
    Normalizes a category name by replacing spaces and underscores with hyphens and converting to lowercase.

    Thin wrapper kept so callers in this module share the exact slug rules
    defined in src.file_utils.

    Args:
        category_name: The category name to normalize

    Returns:
        The normalized category
    """
    # Use the create_category_slug function from file_utils.py
    return create_category_slug(category_name)
|
22 |
+
|
23 |
+
|
24 |
+
def process_single_leaderboard(uid, host, model, index, all_results, additional_rules=None, category=None):
    """
    Process a single leaderboard and update the results.

    Runs the parser agent with retries, validates its output, builds a
    normalized result entry (status "approved" or "rejected"), and inserts
    or replaces it in *all_results* (mutated in place and returned).

    Args:
        uid: The UID of the leaderboard to process
        host: The URL of the leaderboard
        model: The model to use
        index: The index of the leaderboard
        all_results: The list of all results
        additional_rules: Additional specific rules for this leaderboard
        category: The category of the leaderboard (for combined identifier)

    Returns:
        The updated list of results
    """
    print(f"\n\nProcessing leaderboard: {uid} - {host}")
    if additional_rules:
        print(f"Additional rules for this leaderboard: {additional_rules}")
    if category:
        normalized_category = normalize_category(category)
        print(f"Category: {category} (normalized: {normalized_category})")
    else:
        normalized_category = None

    # Get the maximum number of retries from environment variables
    max_retries = int(os.getenv("LEADERBOARD_MAX_RETRIES", "3"))
    print(f"Maximum number of retries configured: {max_retries}")

    attempt = 0
    last_error = None

    # Try to process the leaderboard multiple times
    # NOTE(review): if LEADERBOARD_MAX_RETRIES is 0 the loop never runs and
    # `result` below is unbound (NameError) — confirm the env var is >= 1.
    while attempt < max_retries:
        attempt += 1
        if attempt > 1:
            print(f"Retry attempt {attempt}/{max_retries} for leaderboard {uid} - {host}")

        # Process the leaderboard
        result = process_leaderboard(host, model, index, uid, additional_rules)

        # If the parsing was successful or we've reached the maximum number of retries
        if result.get("parsing_status") == "success" or attempt >= max_retries:
            break

        # If there was an error, save it for later
        # NOTE(review): last_error is recorded but never read after the loop.
        if result.get("parsing_status") == "error":
            last_error = result.get("parsing_message", "Unknown error")
            print(f"Error during attempt {attempt}: {last_error}")

    # Get parsing date from result or generate a new one if not available
    if result and "parsed_at" in result:
        parsed_at = result["parsed_at"]
    else:
        # Fallback to current time if not provided by process_leaderboard
        now = datetime.datetime.now()
        parsed_at = now.isoformat()

    # Create combined ID if category is provided
    result_uid = uid
    if normalized_category:
        # Format of the combined UID: category_uid
        # The category is already normalized (slugified) by normalize_category
        # The underscore "_" is the ONLY separator between the category and the UID
        result_uid = f"{normalized_category}_{uid}"

    # Create base result object with uid, host, and thumbnail
    leaderboard_result = {
        "uid": result_uid,
        "original_uid": uid,
        "category": normalized_category,
        "host": host,
        "parsing_status": "rejected",  # Default to rejected
        "parsed_at": parsed_at
    }

    # Check if we have valid results
    valid_result = False
    if result and result.get("results"):
        if isinstance(result["results"], dict):
            # Check if we have top models with required fields
            if "top_models" in result["results"] and len(result["results"]["top_models"]) > 0:
                valid_models = True
                for model_info in result["results"]["top_models"]:
                    # Each model must have at least rank and name
                    if not model_info.get("rank") or not model_info.get("name"):
                        valid_models = False
                        break

                # Check if we have evaluation criteria
                if valid_models and "evaluation_criteria" in result["results"] and result["results"]["evaluation_criteria"]:
                    valid_result = True
        else:
            print(f"Invalid results format: {type(result['results']).__name__}, expected dict")
    else:
        print(f"Missing or empty results in agent response")

    # If we have valid results, extract the data
    if valid_result:
        leaderboard_result["parsing_status"] = "approved"
        leaderboard_result["top_models"] = []
        leaderboard_result["evaluation_criteria"] = result["results"]["evaluation_criteria"]

        # Extract top models (keep only rank/name/url, dropping extra keys)
        for model_info in result["results"]["top_models"]:
            model_entry = {
                "rank": model_info.get("rank"),
                "name": model_info.get("name"),
                "url": model_info.get("url", None)
            }
            leaderboard_result["top_models"].append(model_entry)
    else:
        print(f"Leaderboard rejected: {uid} - Incomplete or invalid information")

    # Check if this UID already exists in the results
    for i, existing_result in enumerate(all_results):
        if existing_result["uid"] == result_uid:
            # Replace the existing result
            all_results[i] = leaderboard_result
            print(f"Result updated for UID: {result_uid}")
            return all_results

    # ADDITIONAL CHECK: Make sure there's no confusion with other categories
    # for the same original_uid
    # NOTE(review): direct key access assumes every stored entry carries
    # "original_uid" and "category" — legacy entries missing them would
    # raise KeyError here.
    for existing_result in all_results:
        if existing_result["original_uid"] == uid and existing_result["category"] != normalized_category:
            print(f"WARNING: A result already exists for original_uid {uid} but with a different category:")
            print(f"  - Existing category: {existing_result['category']}, UID: {existing_result['uid']}")
            print(f"  - New category: {normalized_category}, UID: {result_uid}")
            # We continue anyway, as it's a valid case to have the same leaderboard in different categories

    # If we get here, this is a new result
    all_results.append(leaderboard_result)
    print(f"New result added for UID: {result_uid}")
    return all_results
|
src/processor.py
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Leaderboard processing module for the leaderboard parser.
|
3 |
+
This module contains the main functions for processing leaderboards.
|
4 |
+
"""
|
5 |
+
import json
|
6 |
+
import os
|
7 |
+
import datetime
|
8 |
+
import logging
|
9 |
+
import time
|
10 |
+
import argparse
|
11 |
+
from typing import Dict, Any, List, Tuple, Optional
|
12 |
+
|
13 |
+
# Import functions from other modules
|
14 |
+
from src.file_utils import save_results, format_datetime, clean_output_files, update_leaderboard_result
|
15 |
+
from src.file_utils import create_category_slug, split_combined_id, create_combined_id
|
16 |
+
from src.file_utils import load_and_validate_results, validate_leaderboard_result
|
17 |
+
from src.hub_utils import upload_to_hub, download_from_hub
|
18 |
+
from src.leaderboard_processor import process_single_leaderboard
|
19 |
+
from src.agents.parser.parser_agent import get_default_model
|
20 |
+
from src.agents.browser import cleanup_browser
|
21 |
+
|
22 |
+
# Configure logger
|
23 |
+
logger = logging.getLogger("leaderboard-parser")
|
24 |
+
|
25 |
+
# Update state variables in server module
|
26 |
+
def update_server_status(status, error=None):
    """
    Update the processing status shared with the API server.

    Args:
        status: The new status ('idle', 'running', 'completed', 'failed')
        error: The error message in case of failure
    """
    try:
        # Probe for the server module; raises ImportError in non-server mode.
        import src.server

        # Mirror the values into this module's namespace as well, so any
        # reader of src.processor.processing_status sees the same state.
        globals()['processing_status'] = status
        globals()['processing_error'] = error

        # Push the new state into the server module, where the API
        # endpoints actually read it.
        src.server.processing_status = status
        src.server.processing_error = error
    except ImportError:
        # In non-server mode the server module is unavailable; status
        # tracking is simply skipped.
        pass
|
48 |
+
|
49 |
+
def process_leaderboards(args_dict=None) -> Tuple[bool, str]:
    """
    Run a full leaderboard-processing pass.

    Loads the category list and leaderboard definitions, decides which
    leaderboards need (re)processing based on their last parse date and the
    force-retry options, processes each one with the parser agent, persists
    results after every leaderboard, and optionally syncs with the HF Hub.

    Args:
        args_dict: Optional dict of options (local_only, clean, upload_only,
            retry_rejected, force_retry_uid, force_retry_category, and the
            legacy force_retry). Defaults to {"local_only": False}.

    Returns:
        Tuple of (success, message).
    """
    # Mark the run as started in the server module (no-op in CLI mode)
    update_server_status("running")

    # Set default arguments if none provided
    if args_dict is None:
        args_dict = {"local_only": False}

    # Create an argparse.Namespace object from the dictionary so that
    # attribute access (args.local_only, ...) works uniformly
    args = argparse.Namespace(**args_dict)

    try:
        # Ensure we're in the project root (parent of this file's directory)
        script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        os.chdir(script_dir)

        # Verify that the HF token is set (required unless running local-only)
        if not os.environ.get("HUGGING_FACE_HUB_TOKEN") and not args.local_only:
            raise ValueError("HUGGING_FACE_HUB_TOKEN environment variable is not set!")

        # Default paths for the category list, leaderboard definitions and results
        category_list_file = "data/best_model_for_category_list.json"
        leaderboards_file = "data/final_leaderboards.json"
        results_file = "data/best_model_for_results.json"

        # Clean previous output files if requested
        if getattr(args, "clean", False):
            clean_output_files(results_file)

        # Upload-only mode: push current files to the Hub and stop
        if getattr(args, "upload_only", False):
            upload_to_hub(to_parse_file=category_list_file, results_file=results_file)
            update_server_status("completed")
            return True, "Upload completed successfully"

        # Download data from the Hub if not in local-only mode
        if not getattr(args, "local_only", False):
            download_from_hub()

        logger.info(f"Starting leaderboard processing")

        # Load the category list and leaderboards data
        try:
            with open(category_list_file, "r", encoding="utf-8") as f:
                category_list = json.load(f)

            with open(leaderboards_file, "r", encoding="utf-8") as f:
                leaderboards = json.load(f)

            # Create a mapping UID -> HOST for all leaderboards
            uid_to_host = {lb["uid"]: lb["host"] for lb in leaderboards if "uid" in lb and "host" in lb}
            logger.info(f"Loaded {len(uid_to_host)} UID -> HOST mappings from {leaderboards_file}")
        except FileNotFoundError as e:
            update_server_status("failed", str(e))
            return False, f"File not found: {e}"

        # Load existing results if any; fall back to empty lists on failure
        try:
            logger.info(f"Loading and validating results from {results_file}")
            results_data = load_and_validate_results(results_file)
            all_results = results_data
            logger.info(f"Loaded and validated {len(all_results)} existing results")
        except Exception as e:
            logger.warning(f"Error loading results: {str(e)}")
            results_data = []
            all_results = []

        # Map combined UID ("category_uid") -> existing result, used to check
        # the last parsing date of each leaderboard
        processed_results_map = {}
        for result in results_data:
            if "uid" in result:
                processed_results_map[result["uid"]] = result

        # Reprocessing interval from the environment (hours); default 24
        reprocess_interval_hours = int(os.getenv("LEADERBOARD_REPROCESS_INTERVAL_HOURS", "24"))

        # Maximum age without update (in seconds)
        max_age_seconds = reprocess_interval_hours * 60 * 60
        logger.info(f"Leaderboard reprocessing interval: {reprocess_interval_hours} hours")

        # Current date and time, compared against each result's parsed_at
        now = datetime.datetime.now()
        print(f"Current system date: {now.isoformat()} - Readable format: {format_datetime(now.isoformat())}")

        # Get the default parser agent/model
        model = get_default_model()

        # Collect all leaderboards to process
        leaderboards_to_process = []
        # NOTE(review): force_retry_leaderboards is never used below — candidate for cleanup
        force_retry_leaderboards = []

        # Debug logs about the available inputs
        logger.info(f"Available categories: {len(category_list)}")
        logger.info(f"Available leaderboards: {len(uid_to_host)}")
        logger.info(f"Sample of available UIDs: {list(uid_to_host.keys())[:5]}")

        # Check if a specific category or UID is requested (exclusive modes)
        target_category = getattr(args, "force_retry_category", None)
        target_uid = getattr(args, "force_retry_uid", None)

        # NOTE(review): exclusive_mode is computed but never read below
        exclusive_mode = target_category is not None or target_uid is not None

        if target_category:
            logger.info(f"Force retry category mode enabled (exclusive): {target_category}")

        if target_uid:
            logger.info(f"Force retry UID mode enabled (exclusive): {target_uid}")

        # First pass: decide which leaderboards need processing
        for category in category_list:
            category_name = category["category"]
            normalized_category = create_category_slug(category_name)

            # In specific-category mode, skip every other category
            if target_category and target_category != normalized_category:
                logger.info(f"Category {category_name} (normalized: {normalized_category}) ignored - Does not match target category {target_category}")
                continue

            # ADDITIONAL SAFETY: Reload data from file before each new category
            # so that no stale in-memory state leaks between categories
            try:
                logger.info(f"Reloading data from file before processing category: {category_name}")
                all_results = load_and_validate_results(results_file)
                logger.info(f"Data reloaded successfully: {len(all_results)} results available")
            except Exception as e:
                logger.warning(f"Unable to reload data before category {category_name}: {str(e)}")
                # On error, keep existing data if it is still a valid list
                if not isinstance(all_results, list):
                    all_results = []

            # Check that the category actually lists leaderboards
            if "leaderboards" not in category or not isinstance(category["leaderboards"], list):
                logger.warning(f"Category '{category_name}' has no leaderboards or incorrect format.")
                continue

            # Examine each leaderboard in the category
            for leaderboard in category["leaderboards"]:
                if "uid" not in leaderboard:
                    logger.warning(f"Leaderboard in category '{category_name}' has no UID.")
                    continue

                leaderboard_uid = leaderboard["uid"]

                # In specific-UID mode, ignore all other leaderboards
                if target_uid and target_uid != leaderboard_uid:
                    logger.info(f"Leaderboard {leaderboard_uid} ignored - Does not match target UID {target_uid}")
                    continue

                # Optional extra instructions for the parsing agent
                # (key spelling "additionnal_agent_rules" matches the data files)
                additional_rules = leaderboard.get("additionnal_agent_rules", None)

                # Determine whether processing is forced, via the new
                # distinct options...
                force_retry_uid = getattr(args, "force_retry_uid", None) == leaderboard_uid
                force_retry_category = getattr(args, "force_retry_category", None) == normalized_category

                # ...or via the legacy --force-retry option (kept for backward
                # compatibility; matches either the UID or the category slug)
                legacy_force_retry = False
                if hasattr(args, "force_retry") and getattr(args, "force_retry", None) is not None:
                    legacy_force_retry = (
                        getattr(args, "force_retry", None) == leaderboard_uid or
                        getattr(args, "force_retry", None) == normalized_category
                    )
                    if legacy_force_retry:
                        logger.warning("The --force-retry option is obsolete. Use --force-retry-uid or --force-retry-category instead.")

                # Combine the different sources of force_retry
                force_retry = force_retry_uid or force_retry_category or legacy_force_retry

                # Log the specific reason the retry was forced
                if force_retry:
                    if force_retry_uid:
                        logger.info(f"Force retry enabled for leaderboard UID: {leaderboard_uid}")
                    elif force_retry_category:
                        logger.info(f"Force retry enabled for all leaderboards in category: {normalized_category}")
                    elif legacy_force_retry:
                        logger.info(f"Force retry enabled via the old --force-retry option for: {getattr(args, 'force_retry', None)}")

                # Resolve the leaderboard URL via the UID -> HOST mapping
                host = uid_to_host.get(leaderboard_uid)

                if not host:
                    logger.warning(f"UID '{leaderboard_uid}' (category: {normalized_category}) not found in leaderboards.")
                    # Show more information for debugging
                    logger.debug(f"Total number of UIDs available: {len(uid_to_host)}")
                    continue

                # Combined identifier "category_uid"; the category is already
                # normalized by create_category_slug
                combined_uid = create_combined_id(normalized_category, leaderboard_uid)

                # Forced retries skip the last-processed-date check entirely
                if force_retry:
                    logger.info(f"Force retry enabled for {combined_uid} - Processing forced independently of last processing date.")
                    leaderboards_to_process.append({
                        "uid": leaderboard_uid,
                        "host": host,
                        "category": normalized_category,
                        "additional_rules": additional_rules,
                        "force_retry": force_retry
                    })
                    continue  # Skip directly to the next leaderboard

                # Otherwise, check whether the leaderboard was processed recently
                needs_reprocessing = True
                if combined_uid in processed_results_map:
                    # An earlier result exists for this combined UID
                    result = processed_results_map[combined_uid]

                    # --retry-rejected forces reprocessing of rejected results
                    # (falls through with needs_reprocessing still True)
                    if getattr(args, "retry_rejected", False) and result.get("parsing_status") == "rejected":
                        logger.info(f"Leaderboard {combined_uid} previously rejected, forced reprocessing with --retry-rejected.")
                    elif "parsed_at" in result:
                        try:
                            # Convert the stored ISO parsing date to a datetime
                            parsed_at = datetime.datetime.fromisoformat(result["parsed_at"])

                            # Time elapsed since the last parsing
                            time_diff = now - parsed_at

                            # Debug logs for the date comparison
                            logger.info(f"DEBUG: Current date: {now.isoformat()}")
                            logger.info(f"DEBUG: Last parsing date: {parsed_at.isoformat()}")
                            logger.info(f"DEBUG: Time difference in seconds: {time_diff.total_seconds()}")
                            logger.info(f"DEBUG: Reprocessing threshold (seconds): {max_age_seconds}")

                            # Strict comparison of the elapsed seconds vs threshold
                            time_seconds = time_diff.total_seconds()

                            # Reparse only when the result is older than the threshold
                            if time_seconds > max_age_seconds:
                                needs_reprocessing = True
                                print(f"\n\nLeaderboard {combined_uid} - {host} parsed more than {reprocess_interval_hours} hours ago ({format_datetime(result['parsed_at'])}), reprocessing necessary.")
                            else:
                                print(f"\n\nLeaderboard {combined_uid} - {host} already processed recently ({format_datetime(result['parsed_at'])}), moving to next. Age: {time_seconds} seconds (threshold: {max_age_seconds})")
                                continue
                        except (ValueError, TypeError):
                            # Invalid stored date: reprocess as a precaution
                            logger.info(f"Leaderboard {combined_uid} has an invalid processing date, reprocessing necessary.")
                    else:
                        # Missing parsing date: reprocess as a precaution
                        logger.info(f"Leaderboard {combined_uid} has no processing date, reprocessing necessary.")
                else:
                    # Never processed before: process it
                    logger.info(f"New leaderboard {combined_uid} to process.")

                if needs_reprocessing or force_retry:
                    leaderboards_to_process.append({
                        "uid": leaderboard_uid,
                        "host": host,
                        "category": normalized_category,
                        "additional_rules": additional_rules,
                        "force_retry": force_retry
                    })

        # Summary of the work queue
        logger.info(f"Total number of leaderboards to process: {len(leaderboards_to_process)}")

        # Second pass: process each selected leaderboard
        for index, leaderboard_info in enumerate(leaderboards_to_process):
            leaderboard_uid = leaderboard_info["uid"]
            host = leaderboard_info["host"]
            category_name = leaderboard_info["category"]
            additional_rules = leaderboard_info["additional_rules"]
            force_retry = leaderboard_info["force_retry"]

            logger.info(f"Processing leaderboard {index+1}/{len(leaderboards_to_process)}: {leaderboard_uid} (category: {category_name})")

            try:
                # Restart the browser every 2 leaderboards to avoid memory leaks
                if index > 0 and index % 2 == 0:
                    logger.info(f"Periodic browser cleanup after {index} leaderboards to avoid memory leaks")
                    cleanup_browser()
                    # Force garbage collection
                    import gc
                    gc.collect()
                    # Small pause to let the system clean up
                    time.sleep(3)

                # Process the leaderboard with the parser agent
                all_results = process_single_leaderboard(
                    leaderboard_uid,
                    host,
                    model,
                    index,
                    all_results,
                    additional_rules,
                    category_name
                )

                # Detailed logs for diagnosing problems
                logger.info(f"Results after processing: {len(all_results)} elements")
                # List every result matching the processed leaderboard's UID
                for idx, res in enumerate(all_results):
                    if res.get("original_uid") == leaderboard_uid:
                        logger.info(f"Found result {idx}: uid={res.get('uid')}, original_uid={res.get('original_uid')}, category={res.get('category')}")

                # Clean up the browser after each processing
                cleanup_browser()

                # Locate the result for this exact leaderboard:
                # strict search by original_uid AND normalized category
                normalized_category_name = create_category_slug(category_name)
                current_result = None
                for result in all_results:
                    # Always compare normalized categories to avoid format issues
                    result_category = result.get("category", "")
                    if result.get("original_uid") == leaderboard_uid and create_category_slug(result_category) == normalized_category_name:
                        current_result = result
                        logger.info(f"Found result for {leaderboard_uid}, category: {result.get('category')}")
                        break

                # REMOVED: no fallback search by original_uid alone.
                # A missing result here means the processing failed or errored.
                if not current_result:
                    logger.error(f"RESULT NOT FOUND for {leaderboard_uid}, normalized_category: {normalized_category_name}")
                    logger.error(f"Search for all results corresponding to this UID:")
                    for res in all_results:
                        if res.get("original_uid") == leaderboard_uid:
                            logger.error(f" - Result with category={res.get('category')}, uid={res.get('uid')}")
                    logger.error(f"Leaderboard {leaderboard_uid} (category: {category_name}) not updated because result not found")
                    continue

                # Update only this specific leaderboard in the results file
                logger.info(f"Updating leaderboard {leaderboard_uid} (category: {category_name}) in file")
                updated_results = update_leaderboard_result(current_result, results_file)

                # CRITICAL FIX: refresh the in-memory list from the file data
                # to avoid desynchronization between file and in-memory list
                all_results = updated_results

                # Keep results_data in sync for the next leaderboard
                results_data = updated_results

                logger.info(f"Leaderboard {leaderboard_uid} (category: {category_name}) saved")

                # Upload to HF Hub after each leaderboard if not in local-only mode
                if not getattr(args, "local_only", False):
                    logger.info(f"Uploading results to HF Hub after processing leaderboard {leaderboard_uid}")
                    try:
                        upload_to_hub(to_parse_file=category_list_file, results_file=results_file)
                        logger.info(f"Upload successful to HF Hub for leaderboard {leaderboard_uid}")
                    except Exception as upload_err:
                        # Upload failures are non-fatal; results stay on disk
                        logger.warning(f"Upload to HF Hub failed after processing leaderboard {leaderboard_uid}: {str(upload_err)}")
            except Exception as e:
                # A failure on one leaderboard must not stop the whole run
                logger.error(f"Error processing leaderboard {leaderboard_uid} (category: {category_name}): {str(e)}")
                continue

        # No final save needed: every leaderboard was persisted individually
        logger.info("Leaderboard processing completed")

        update_server_status("completed")
        return True, "Processing completed successfully"

    except Exception as e:
        update_server_status("failed", str(e))
        logger.exception("Error processing leaderboards")
        return False, f"Error processing leaderboards: {str(e)}"
|
src/scheduler.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Scheduling module for the leaderboard parser.
|
3 |
+
This module contains scheduling functions for periodic execution of leaderboard processing.
|
4 |
+
"""
|
5 |
+
import datetime
|
6 |
+
import threading
|
7 |
+
import time
|
8 |
+
import logging
|
9 |
+
import os
|
10 |
+
|
11 |
+
# Initialize logger
|
12 |
+
logger = logging.getLogger("leaderboard-parser")
|
13 |
+
|
14 |
+
# Global variables for scheduler
|
15 |
+
stop_thread = False
|
16 |
+
last_run_time = None
|
17 |
+
|
18 |
+
# Reference to the processing function (will be defined by initialize_scheduler)
|
19 |
+
process_leaderboards_function = None
|
20 |
+
|
21 |
+
def initialize_scheduler(process_function):
    """
    Register the callable the scheduler thread will invoke on each cycle.

    Must be called before start_scheduler(), otherwise the thread exits
    immediately.

    Args:
        process_function: Function that processes leaderboards
    """
    global process_leaderboards_function
    process_leaderboards_function = process_function
    logger.info("Scheduler initialized with processing function")
|
31 |
+
|
32 |
+
def scheduler_thread():
    """
    Background loop that periodically triggers leaderboard processing.

    Runs until stop_thread is set. The interval comes from the
    LEADERBOARD_REPROCESS_INTERVAL_HOURS environment variable (default 24).
    Requires initialize_scheduler() to have registered a processing function.
    """
    global stop_thread, last_run_time, process_leaderboards_function

    # Without a registered processing function the thread cannot do anything
    if not process_leaderboards_function:
        logger.error("Scheduler has not been initialized with a processing function")
        return

    logger.info("Scheduler thread started")

    # Get the reprocess interval from environment, default to 24 hours
    interval_hours = int(os.environ.get("LEADERBOARD_REPROCESS_INTERVAL_HOURS", 24))
    interval_seconds = interval_hours * 3600

    logger.info(f"Leaderboard reprocess interval set to {interval_hours} hours")

    while not stop_thread:
        now = datetime.datetime.now()

        # Run if there was no previous run or the interval has elapsed
        if last_run_time is None or (now - last_run_time).total_seconds() >= interval_seconds:
            logger.info(f"{interval_hours} hours have passed since last run, executing the job")

            # Fetch the current status; the import re-reads the module
            # attribute each iteration, so the value is fresh
            from src.server import processing_status

            # Skip the run if processing is already in progress
            if processing_status != "running":
                # Record the start time BEFORE running so the interval is
                # measured from the start of the job
                last_run_time = now
                success, message = process_leaderboards_function({"local_only": False})
                logger.info(f"Processing job completed with status: {success}, message: {message}")

            # Wait at least 80% of the interval before checking again.
            # This prevents multiple executions and provides a buffer.
            # NOTE(review): this long sleep also applies when the run was
            # skipped because processing_status == "running" — confirm intended.
            time.sleep(interval_seconds * 0.8)
        else:
            # Calculate time until next run
            seconds_until_next_run = interval_seconds - (now - last_run_time).total_seconds()
            hours_until_next_run = seconds_until_next_run / 3600

            # Log progress roughly every hour
            if int(seconds_until_next_run) % 3600 < 10:  # Log within the first 10 seconds of each hour
                logger.info(f"Next scheduled run in {hours_until_next_run:.1f} hours")

            # Sleep for a minute before checking again
            time.sleep(60)
|
79 |
+
|
80 |
+
def start_scheduler():
    """Launch the scheduler loop in a background daemon thread and return it."""
    global stop_thread

    # Clear any previous stop request so the loop actually runs
    stop_thread = False

    # Daemon thread: it must not keep the process alive on shutdown
    worker = threading.Thread(target=scheduler_thread, daemon=True)
    worker.start()

    logger.info("Scheduler thread started")
    return worker
|
94 |
+
|
95 |
+
def stop_scheduler():
    """Request the scheduler loop to exit at its next wake-up."""
    global stop_thread
    stop_thread = True
    logger.info("Scheduler thread stop requested")
|
src/server.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Module de serveur API pour le leaderboard parser.
|
3 |
+
Ce module contient la configuration FastAPI et les endpoints pour le mode serveur.
|
4 |
+
"""
|
5 |
+
import datetime
|
6 |
+
import threading
|
7 |
+
import logging
|
8 |
+
import os
|
9 |
+
from fastapi import FastAPI, HTTPException
|
10 |
+
from fastapi.responses import JSONResponse
|
11 |
+
from src.file_utils import format_datetime
|
12 |
+
|
13 |
+
# Initialiser le logger
|
14 |
+
logger = logging.getLogger("leaderboard-parser")
|
15 |
+
|
16 |
+
# Variables globales pour suivre l'état du serveur
|
17 |
+
processing_status = "idle"
|
18 |
+
processing_error = None
|
19 |
+
last_run_time = None
|
20 |
+
|
21 |
+
# Initialiser l'application FastAPI
|
22 |
+
app = FastAPI(title="Leaderboard Parser API")
|
23 |
+
|
24 |
+
# Cette fonction sera importée depuis main.py
|
25 |
+
process_leaderboards = None
|
26 |
+
|
27 |
+
def initialize_server(process_function):
    """
    Inject the leaderboard-processing callable used by the /run endpoint.

    Must be called before the server starts handling requests.

    Args:
        process_function: Function that processes leaderboards
    """
    global process_leaderboards
    process_leaderboards = process_function
    logger.info("Serveur initialisé avec la fonction de traitement")
|
38 |
+
|
39 |
+
# Endpoints API
|
40 |
+
@app.get("/")
|
41 |
+
async def root():
|
42 |
+
"""Root endpoint returning basic info"""
|
43 |
+
return {
|
44 |
+
"name": "Leaderboard Parser API",
|
45 |
+
"status": "running",
|
46 |
+
"version": "1.0.0"
|
47 |
+
}
|
48 |
+
|
49 |
+
@app.get("/status")
|
50 |
+
async def get_status():
|
51 |
+
"""Get the current status of the parser"""
|
52 |
+
global processing_status, last_run_time, processing_error
|
53 |
+
|
54 |
+
return {
|
55 |
+
"status": processing_status,
|
56 |
+
"last_run": format_datetime(last_run_time) if last_run_time else None,
|
57 |
+
"next_run": format_datetime(last_run_time + datetime.timedelta(hours=int(os.environ.get("LEADERBOARD_REPROCESS_INTERVAL_HOURS", 24)))) if last_run_time else None,
|
58 |
+
"error": processing_error
|
59 |
+
}
|
60 |
+
|
61 |
+
@app.post("/run")
|
62 |
+
async def trigger_run():
|
63 |
+
"""Manually trigger a leaderboard processing run"""
|
64 |
+
global processing_status, process_leaderboards
|
65 |
+
|
66 |
+
if not process_leaderboards:
|
67 |
+
raise HTTPException(status_code=500, detail="Server not properly initialized")
|
68 |
+
|
69 |
+
if processing_status == "running":
|
70 |
+
raise HTTPException(status_code=409, detail="Processing is already running")
|
71 |
+
|
72 |
+
# Start processing in a separate thread
|
73 |
+
threading.Thread(target=lambda: process_leaderboards()).start()
|
74 |
+
|
75 |
+
return {"status": "started", "message": "Processing started"}
|