tfrere committed
Commit 0821095 · 0 parent(s)

first commit
Files changed (47)
  1. .env.example +15 -0
  2. .gitignore +3 -0
  3. Dockerfile +83 -0
  4. README.md +90 -0
  5. data/.DS_Store +0 -0
  6. data/best.backup.json +185 -0
  7. data/best_model_for_category_list.json +190 -0
  8. data/best_model_for_results.json.lock +0 -0
  9. experiments/simple_smolagent.py +31 -0
  10. experiments/smolagent_parser.py +221 -0
  11. experiments/vision_web_browser.py +210 -0
  12. main.py +87 -0
  13. poetry.lock +0 -0
  14. pyproject.toml +27 -0
  15. scripts/test_agent.py +121 -0
  16. src/__pycache__/agent.cpython-310.pyc +0 -0
  17. src/__pycache__/browser.cpython-310.pyc +0 -0
  18. src/__pycache__/browser_utils.cpython-310.pyc +0 -0
  19. src/__pycache__/file_utils.cpython-310.pyc +0 -0
  20. src/__pycache__/hub_utils.cpython-310.pyc +0 -0
  21. src/__pycache__/leaderboard_processor.cpython-310.pyc +0 -0
  22. src/__pycache__/processor.cpython-310.pyc +0 -0
  23. src/__pycache__/scheduler.cpython-310.pyc +0 -0
  24. src/__pycache__/server.cpython-310.pyc +0 -0
  25. src/__pycache__/tools.cpython-310.pyc +0 -0
  26. src/agents/__pycache__/__init__.cpython-310.pyc +0 -0
  27. src/agents/__pycache__/agent.cpython-310.pyc +0 -0
  28. src/agents/__pycache__/agent_core.cpython-310.pyc +0 -0
  29. src/agents/__pycache__/agent_instructions.cpython-310.pyc +0 -0
  30. src/agents/__pycache__/agent_processor.cpython-310.pyc +0 -0
  31. src/agents/__pycache__/agent_tools.cpython-310.pyc +0 -0
  32. src/agents/__pycache__/browser.cpython-310.pyc +0 -0
  33. src/agents/__pycache__/prompts.cpython-310.pyc +0 -0
  34. src/agents/__pycache__/tools.cpython-310.pyc +0 -0
  35. src/agents/__pycache__/validators.cpython-310.pyc +0 -0
  36. src/agents/browser.py +148 -0
  37. src/agents/fact_checker/fact_checker_agent.py +3 -0
  38. src/agents/parser/__pycache__/agent.cpython-310.pyc +0 -0
  39. src/agents/parser/__pycache__/parser_agent.cpython-310.pyc +0 -0
  40. src/agents/parser/parser_agent.py +362 -0
  41. src/agents/tools.py +443 -0
  42. src/file_utils.py +316 -0
  43. src/hub_utils.py +175 -0
  44. src/leaderboard_processor.py +158 -0
  45. src/processor.py +414 -0
  46. src/scheduler.py +99 -0
  47. src/server.py +75 -0
.env.example ADDED
@@ -0,0 +1,15 @@
+ # Hugging Face Hub token (required)
+ # Create one at https://huggingface.co/settings/tokens
+ HUGGING_FACE_HUB_TOKEN=your_token_here
+
+ OPENAI_API_KEY=sk-proj-xxxx
+
+ # Repository ID for storing leaderboard data (required)
+ # Format: username/repo-name
+ HUGGING_FACE_STORAGE_REPO=username/leaderboard-data
+
+ # Time interval, in hours, before reprocessing a leaderboard that has already been analyzed
+ LEADERBOARD_REPROCESS_INTERVAL_HOURS=24
+
+ # Maximum number of attempts to process a leaderboard before it is considered failed
+ LEADERBOARD_MAX_RETRIES=3
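
For reference, a minimal sketch of how these settings are typically read at runtime with `python-dotenv`, the same pattern the scripts in this commit use (the fallback values here are illustrative):

```py
import os
from dotenv import load_dotenv

# Read key=value pairs from .env into the process environment
load_dotenv()

hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")  # required
storage_repo = os.getenv("HUGGING_FACE_STORAGE_REPO", "username/leaderboard-data")
reprocess_hours = int(os.getenv("LEADERBOARD_REPROCESS_INTERVAL_HOURS", "24"))
max_retries = int(os.getenv("LEADERBOARD_MAX_RETRIES", "3"))
```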
.gitignore ADDED
@@ -0,0 +1,3 @@
+ .env
+ data/best_model_for_results.json
+ data/final_leaderboards.json
Dockerfile ADDED
@@ -0,0 +1,83 @@
+ FROM python:3.10-slim
+ WORKDIR /app
+
+ # Create non-root user
+ RUN useradd -m -u 1000 user
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     netcat-openbsd \
+     wget \
+     gnupg \
+     curl \
+     libnss3 \
+     libnspr4 \
+     libatk1.0-0 \
+     libatk-bridge2.0-0 \
+     libcups2 \
+     libdrm2 \
+     libdbus-1-3 \
+     libxkbcommon0 \
+     libx11-6 \
+     libxcomposite1 \
+     libxdamage1 \
+     libxext6 \
+     libxfixes3 \
+     libxrandr2 \
+     libgbm1 \
+     libpango-1.0-0 \
+     libcairo2 \
+     libasound2 \
+     libatspi2.0-0 \
+     unzip \
+     xvfb \
+     libglib2.0-0 \
+     && pip install --upgrade pip \
+     && pip install poetry
+
+ # Install Chrome - required for Helium
+ RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
+     && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list \
+     && apt-get update \
+     && apt-get install -y google-chrome-stable \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Configure environment variables for Chrome
+ ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver \
+     CHROME_PATH=/usr/bin/google-chrome-stable \
+     CHROME_BIN=/usr/bin/google-chrome-stable
+
+ # Copy application files
+ COPY . /app/
+
+ # Install Python dependencies
+ RUN poetry config virtualenvs.create false \
+     && poetry install --no-interaction --no-ansi
+
+ # Environment variables
+ ENV API_HOST=0.0.0.0 \
+     API_PORT=7860 \
+     PYTHONPATH=/app \
+     DISPLAY=:99 \
+     PYTHONUNBUFFERED=1 \
+     SELENIUM_DRIVER_EXECUTABLE_PATH=/usr/bin/chromedriver \
+     LEADERBOARD_REPROCESS_INTERVAL_HOURS=24 \
+     HOME=/home/user
+
+ # Create cache directory and set permissions
+ RUN mkdir -p /app/cache /home/user/.cache && chown -R user:user /app/cache /app/ /home/user/.cache
+
+ # Install additional fonts
+ RUN apt-get update && apt-get install -y \
+     fonts-noto-color-emoji \
+     fonts-freefont-ttf \
+     libharfbuzz-icu0 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Switch to non-root user
+ USER user
+
+ EXPOSE 7860
+
+ # Start the application in server mode
+ CMD ["python", "main.py", "--server", "--retry-rejected"]
README.md ADDED
@@ -0,0 +1,90 @@
+ # Leaderboard Parser
+
+ A tool for automatically extracting data from Hugging Face leaderboards using AI agents.
+
+ ## Project structure
+
+ ```
+ leaderboard-parser/
+ ├── main.py                      # Main entry point
+ ├── data/                        # Input and output data
+ │   ├── leaderboards.json        # List of leaderboard URLs to process
+ │   └── leaderboard_results.json # Extraction results
+ ├── src/                         # Main source code
+ │   ├── agent.py                 # AI agent management
+ │   ├── browser.py               # Browser management
+ │   └── tools.py                 # Tools used by the agent
+ ├── experiments/                 # Experimental scripts
+ ├── pyproject.toml               # Poetry configuration
+ └── README.md                    # Documentation
+ ```
+
+ ## Description
+
+ This project uses Playwright and smolagents to browse Hugging Face leaderboards and extract information about each leaderboard's top-ranked models. The extracted information includes:
+
+ - Model name
+ - Score
+ - Position/rank
+ - Creator/author
+
+ ## Requirements
+
+ - Python 3.10 or higher
+ - Poetry (dependency manager)
+
+ ## Installation
+
+ 1. Make sure Python 3.10+ is installed
+ 2. Install Poetry if you haven't already: `pip install poetry`
+ 3. Install the dependencies: `poetry install`
+
+ ## Configuration
+
+ 1. Copy `.env.example` to `.env`
+ 2. Set your API keys in the `.env` file
+ 3. Edit `data/leaderboards.json` to add or remove leaderboard URLs
+
+ ## Usage
+
+ To run the parser on all leaderboards defined in `data/leaderboards.json`:
+
+ ```bash
+ poetry run leaderboard-parser
+ ```
+
+ Or directly:
+
+ ```bash
+ poetry run python main.py
+ ```
+
+ The results are saved to `data/leaderboard_results.json`.
+
+ ## How it works
+
+ The tool uses an AI agent based on `smolagents` to:
+
+ 1. Navigate to each leaderboard URL
+ 2. Analyze the page to find the ranking table
+ 3. Extract the top three models along with their scores
+ 4. Save the results to a JSON file
+
+ ## Development
+
+ To add a new tool to the agent, create a function in `src/tools.py` and decorate it with `@tool` (see the sketch below).
+
+ To change the instructions given to the agent, edit the `leaderboard_instructions` variable in `src/agent.py`.
+
+ ## Experiments
+
+ The `experiments/` folder contains experimental scripts that were used to develop and test different approaches.
+
+ ## Processing flow
+
+ 1. The script loads the leaderboard URLs from `data/leaderboards.json`
+ 2. For each URL, it launches a browser and uses an AI agent to:
+    - Navigate to the leaderboard URL
+    - Analyze the page to find the top three models
+    - Extract the relevant information
+ 3. The results are saved to `data/leaderboard_results.json`
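
As the Development section notes, agent tools are plain functions decorated with smolagents' `@tool`. A minimal sketch of the pattern; the tool name and body here are hypothetical:

```py
from smolagents import tool

@tool
def count_table_rows(selector: str) -> str:
    """Counts the rows of the table matching a CSS selector.

    Args:
        selector: The CSS selector of the table to inspect
    """
    # Hypothetical body: a real tool would query the live browser here
    return f"Counted rows for the table matching '{selector}'"
```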
data/.DS_Store ADDED
Binary file (6.15 kB)
 
data/best.backup.json ADDED
@@ -0,0 +1,185 @@
+ [
+   {
+     "category": "text generation",
+     "emoji": "📝",
+     "leaderboards": [
+       {
+         "uid": "6468923b99182de17844bf7b",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       },
+       {
+         "uid": "643d3016d2c1e08a5eca0c22",
+         "additionnal_agent_rules": "you have to check the 'only official providers' filter before trying to get the best models. It is mandatory to check this filter.",
+         "is_open_source": true
+       }
+     ]
+   },
+   {
+     "category": "uncensored text generation",
+     "emoji": "🔓",
+     "leaderboards": [
+       {
+         "uid": "65f0f612555caedb299e54d9",
+         "additionnal_agent_rules": "You have to remove models that are 'unavailable'",
+         "is_open_source": true
+       }
+     ]
+   },
+   {
+     "category": "image understanding",
+     "emoji": "📷",
+     "leaderboards": [
+       {
+         "uid": "6468923b99182de17844bf7b",
+         "additionnal_agent_rules": "you have to search for 'Arena (vision)' tab. You are searching for the best VLM models. It is mandatory to check this tab. If there is no information about models in this tab, fail.",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "agentic",
+     "emoji": "🤖",
+     "leaderboards": [
+       {
+         "uid": "67909d72a1832c8a7cdd4599",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "math",
+     "emoji": "🧮",
+     "leaderboards": [
+       {
+         "uid": "643d3016d2c1e08a5eca0c22",
+         "additionnal_agent_rules": "You have to click on MATH to sort the leaderboard by score. We are searching for the best math models.",
+         "is_open_source": true
+       }
+     ]
+   },
+   {
+     "category": "code",
+     "emoji": "💻",
+     "leaderboards": [
+       {
+         "uid": "6662b2c6cc6519da32cd6f4d",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       },
+       {
+         "uid": "657b23848e7790a347c7e4ea",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "embedding",
+     "emoji": "📦",
+     "leaderboards": [
+       {
+         "uid": "633581939ac57cf2967be686",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "text to image",
+     "emoji": "🎨",
+     "leaderboards": [
+       {
+         "uid": "665e7241f8cb81b0a476eccb",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       },
+       {
+         "uid": "6670f4cffc615a6257ab35dd",
+         "additionnal_agent_rules": "You have to search for the 'image generation' leaderboard before trying to get the best models. If you don't find it, fail."
+       }
+     ]
+   },
+   {
+     "category": "text to video",
+     "emoji": "🎬",
+     "leaderboards": [
+       {
+         "uid": "6719d6a46937670ca681151e",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       },
+       {
+         "uid": "6670f4cffc615a6257ab35dd",
+         "additionnal_agent_rules": "You have to search for the 'video generation' tab to access the leaderboard before trying to get the best models. If you don't find it, fail."
+       }
+     ]
+   },
+   {
+     "category": "text to 3d",
+     "emoji": "🧊",
+     "leaderboards": [
+       {
+         "uid": "651f831f128d26b399db9ea5",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "text to speech",
+     "emoji": "🔊",
+     "leaderboards": [
+       {
+         "uid": "65a5a7c26145ebc6e7e39243",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "speech to text",
+     "emoji": "🎤",
+     "leaderboards": [
+       {
+         "uid": "64f9e6dd59eae6df399ba1e9",
+         "additionnal_agent_rules": "",
+         "is_open_source": true
+       }
+     ]
+   },
+   {
+     "category": "image to text",
+     "emoji": "📝",
+     "leaderboards": [
+       {
+         "uid": "65b0a64db233ea8ce65f0bc5",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "image background removal",
+     "emoji": "🖼️",
+     "leaderboards": [
+       {
+         "uid": "674eea98c6a6ef2849b4a0ac",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "medical QA tasks",
+     "emoji": "🩺",
+     "leaderboards": [
+       {
+         "uid": "65d70863ef58a69470ead2fc",
+         "additionnal_agent_rules": "",
+         "is_open_source": true
+       }
+     ]
+   }
+ ]
data/best_model_for_category_list.json ADDED
@@ -0,0 +1,190 @@
+ [
+   {
+     "category": "text generation",
+     "emoji": "📝",
+     "leaderboards": [
+       {
+         "uid": "6468923b99182de17844bf7b",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       },
+       {
+         "uid": "643d3016d2c1e08a5eca0c22",
+         "additionnal_agent_rules": "you have to check the 'only official providers' filter before trying to get the best models. It is mandatory to check this filter.",
+         "is_open_source": true
+       }
+     ]
+   },
+   {
+     "category": "uncensored text generation",
+     "emoji": "🔓",
+     "leaderboards": [
+       {
+         "uid": "65f0f612555caedb299e54d9",
+         "additionnal_agent_rules": "You have to remove models that are '(no longer available)'. Keep all the other models.",
+         "is_open_source": true
+       }
+     ]
+   },
+   {
+     "category": "image understanding",
+     "emoji": "📷",
+     "leaderboards": [
+       {
+         "uid": "6468923b99182de17844bf7b",
+         "additionnal_agent_rules": "you have to search for 'Arena (vision)' tab. You are searching for the best VLM models. It is mandatory to check this tab. If there is no information about models in this tab, fail.",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "agentic",
+     "emoji": "🤖",
+     "leaderboards": [
+       {
+         "uid": "67909d72a1832c8a7cdd4599",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "math",
+     "emoji": "🧮",
+     "leaderboards": [
+       {
+         "uid": "643d3016d2c1e08a5eca0c22",
+         "additionnal_agent_rules": "You have to click on MATH to sort the leaderboard by score. We are searching for the best math models.",
+         "is_open_source": true
+       }
+     ]
+   },
+   {
+     "category": "code",
+     "emoji": "💻",
+     "leaderboards": [
+       {
+         "uid": "6662b2c6cc6519da32cd6f4d",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       },
+       {
+         "uid": "657b23848e7790a347c7e4ea",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "embedding",
+     "emoji": "📦",
+     "leaderboards": [
+       {
+         "uid": "633581939ac57cf2967be686",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "text to image",
+     "emoji": "🎨",
+     "leaderboards": [
+       {
+         "uid": "665e7241f8cb81b0a476eccb",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       },
+       {
+         "uid": "6670f4cffc615a6257ab35dd",
+         "additionnal_agent_rules": "You have to search for the 'image generation' leaderboard before trying to get the best models. If you don't find it, fail."
+       }
+     ]
+   },
+   {
+     "category": "text to video",
+     "emoji": "🎬",
+     "leaderboards": [
+       {
+         "uid": "65adcd10d6b10af9119fc960",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       },
+       {
+         "uid": "6719d6a46937670ca681151e",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       },
+       {
+         "uid": "6670f4cffc615a6257ab35dd",
+         "additionnal_agent_rules": "You have to search for the 'video generation' tab to access the leaderboard before trying to get the best models. If you don't find it, fail."
+       }
+     ]
+   },
+   {
+     "category": "text to 3d",
+     "emoji": "🧊",
+     "leaderboards": [
+       {
+         "uid": "651f831f128d26b399db9ea5",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "text to speech",
+     "emoji": "🔊",
+     "leaderboards": [
+       {
+         "uid": "65a5a7c26145ebc6e7e39243",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "speech to text",
+     "emoji": "🎤",
+     "leaderboards": [
+       {
+         "uid": "64f9e6dd59eae6df399ba1e9",
+         "additionnal_agent_rules": "",
+         "is_open_source": true
+       }
+     ]
+   },
+   {
+     "category": "image to text",
+     "emoji": "📝",
+     "leaderboards": [
+       {
+         "uid": "65b0a64db233ea8ce65f0bc5",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "image background removal",
+     "emoji": "🖼️",
+     "leaderboards": [
+       {
+         "uid": "674eea98c6a6ef2849b4a0ac",
+         "additionnal_agent_rules": "",
+         "is_open_source": false
+       }
+     ]
+   },
+   {
+     "category": "medical QA tasks",
+     "emoji": "🩺",
+     "leaderboards": [
+       {
+         "uid": "65d70863ef58a69470ead2fc",
+         "additionnal_agent_rules": "",
+         "is_open_source": true
+       }
+     ]
+   }
+ ]
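
For reference, a minimal sketch of reading this category list (the path comes from main.py's docstring; note the repo consistently spells the key `additionnal_agent_rules`):

```py
import json

with open("data/best_model_for_category_list.json") as f:
    categories = json.load(f)

for category in categories:
    for board in category["leaderboards"]:
        # "additionnal_agent_rules" (sic) is the key used throughout this repo;
        # "is_open_source" is absent on a few entries, hence .get() with a default
        extra_rules = board["additionnal_agent_rules"]
        print(category["category"], board["uid"],
              board.get("is_open_source", False), bool(extra_rules))
```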
data/best_model_for_results.json.lock ADDED
File without changes
experiments/simple_smolagent.py ADDED
@@ -0,0 +1,31 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ """
+ Simple SmoLAgent - A simple script to test authentication with Hugging Face
+ """
+
+ import os
+ import asyncio
+ from dotenv import load_dotenv
+ from huggingface_hub import login
+ from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel
+
+ # Load environment variables from the .env file
+ load_dotenv()
+
+ # Retrieve the Hugging Face token
+ hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
+
+ def main():
+     """Main function to test authentication with Hugging Face."""
+     login(token=hf_token)  # Authenticate with the Hub, which this script is meant to test
+     agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=HfApiModel())
+
+     response = agent.run("Search for the best music recommendations for a party at the Wayne's mansion.")
+     # Print the agent's response
+     print(f"Agent response: {response}")
+
+
+ if __name__ == "__main__":
+     main()
experiments/smolagent_parser.py ADDED
@@ -0,0 +1,221 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ """
+ SmoLAgent Parser - Extracts the first model from each Hugging Face leaderboard
+ using Playwright and smolagents.
+ """
+
+ import json
+ import os
+ import asyncio
+ from pathlib import Path
+ from typing import List, Dict, Any, Optional
+ from dotenv import load_dotenv
+ from huggingface_hub import login
+
+ from playwright.async_api import async_playwright
+ from smolagents import CodeAgent
+ from smolagents.models import HfApiModel
+ from smolagents.tools import Tool
+
+ # Load environment variables from the .env file
+ load_dotenv()
+
+ # Retrieve the Hugging Face token
+ hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
+
+ # Load the leaderboards from the JSON file
+ def load_leaderboards() -> List[str]:
+     """Load the leaderboard URLs from the JSON file."""
+     with open("leaderboards.json", "r") as f:
+         return json.load(f)
+
+ # Define a tool that uses Playwright
+ class PlaywrightBrowserTool(Tool):
+     """Tool for interacting with a web browser via Playwright."""
+
+     name = "browser"
+     description = "Tool for interacting with a web browser via Playwright."
+     inputs = {
+         "goto": {
+             "url": {
+                 "type": "string",
+                 "description": "The URL to navigate to"
+             }
+         },
+         "get_content": {},
+         "get_title": {},
+         "take_screenshot": {
+             "path": {
+                 "type": "string",
+                 "description": "The path where the screenshot should be saved"
+             }
+         },
+         "run_js": {
+             "script": {
+                 "type": "string",
+                 "description": "The JavaScript code to execute in the page context"
+             }
+         },
+         "wait_for": {
+             "selector": {
+                 "type": "string",
+                 "description": "The CSS selector to wait for"
+             },
+             "timeout": {
+                 "type": "integer",
+                 "description": "The maximum wait time in milliseconds"
+             }
+         },
+         "click": {
+             "selector": {
+                 "type": "string",
+                 "description": "The CSS selector of the element to click"
+             }
+         },
+         "fill": {
+             "selector": {
+                 "type": "string",
+                 "description": "The CSS selector of the form field"
+             },
+             "value": {
+                 "type": "string",
+                 "description": "The value to fill into the form field"
+             }
+         }
+     }
+     output_type = "any"
+
+     def __init__(self, page):
+         self.page = page
+
+     async def goto(self, url: str) -> str:
+         """Navigate to a URL."""
+         await self.page.goto(url, wait_until="networkidle", timeout=60000)
+         return f"Navigated to {url}"
+
+     async def get_content(self) -> str:
+         """Get the HTML content of the page."""
+         return await self.page.content()
+
+     async def get_title(self) -> str:
+         """Get the page title."""
+         return await self.page.title()
+
+     async def take_screenshot(self, path: str = "screenshot.png") -> str:
+         """Take a screenshot of the page."""
+         await self.page.screenshot(path=path)
+         return f"Screenshot saved to {path}"
+
+     async def run_js(self, script: str) -> Any:
+         """Execute JavaScript in the page context."""
+         return await self.page.evaluate(script)
+
+     async def wait_for(self, selector: str, timeout: int = 30000) -> str:
+         """Wait for an element matching the selector to appear."""
+         await self.page.wait_for_selector(selector, timeout=timeout)
+         return f"Element with selector '{selector}' found"
+
+     async def click(self, selector: str) -> str:
+         """Click an element matching the selector."""
+         await self.page.click(selector)
+         return f"Clicked the element with selector '{selector}'"
+
+     async def fill(self, selector: str, value: str) -> str:
+         """Fill a form field."""
+         await self.page.fill(selector, value)
+         return f"Filled '{value}' into the element with selector '{selector}'"
+
+ async def extract_first_model(url: str) -> Optional[Dict[str, Any]]:
+     """
+     Extract the first model from a leaderboard using an agent.
+
+     Args:
+         url: The leaderboard URL
+
+     Returns:
+         A dictionary with information about the first model, or None if extraction failed
+     """
+     async with async_playwright() as p:
+         browser = await p.chromium.launch(headless=False)  # Set to True for production
+         page = await browser.new_page()
+
+         try:
+             # Create the Playwright tool
+             browser_tool = PlaywrightBrowserTool(page)
+
+             # Create the agent
+             agent = CodeAgent(
+                 tools=[browser_tool],
+                 model=HfApiModel()
+             )
+
+             # Run the agent
+             prompt = f"""
+             Extract information about the first model of the leaderboard at the following URL: {url}
+
+             Use the browser tool to navigate the page and extract the following information:
+             - Model name
+             - Score
+             - Position/rank
+             - Creator/author
+
+             Return the information as a Python dictionary.
+             """
+
+             result = agent.run(prompt)  # CodeAgent.run is synchronous, so it must not be awaited
+             print(f"Raw agent result: {result}")
+
+             # Try to parse the result as a dictionary
+             try:
+                 # The agent may return a textual representation of a dictionary
+                 if isinstance(result, str):
+                     # Try to find a dictionary structure in the string
+                     import re
+                     dict_match = re.search(r'\{.*\}', result, re.DOTALL)
+                     if dict_match:
+                         dict_str = dict_match.group(0)
+                         # Replace single quotes with double quotes to get valid JSON
+                         dict_str = dict_str.replace("'", '"')
+                         return json.loads(dict_str)
+                     return {"raw_result": result}
+                 return result
+             except Exception as e:
+                 print(f"Error while parsing the result: {e}")
+                 return {"raw_result": str(result)}
+         except Exception as e:
+             print(f"Error while extracting data from {url}: {e}")
+             await page.screenshot(path=f"error_{url.replace('://', '_').replace('/', '_')}.png")
+             return {"error": str(e)}
+         finally:
+             await browser.close()
+
+ async def main():
+     """Main function to process all leaderboards."""
+     # Log in to Hugging Face
+     if hf_token:
+         print("Hugging Face token found in the .env file")
+         login(token=hf_token)
+         print("Successfully logged in to Hugging Face!")
+     else:
+         print("Error: Hugging Face token not found in the .env file")
+         return
+
+     leaderboards = load_leaderboards()
+     results = {}
+
+     for url in leaderboards:
+         print(f"Processing leaderboard: {url}")
+         result = await extract_first_model(url)
+         results[url] = result
+         print(f"Result: {result}")
+
+     # Save the results to a JSON file
+     with open("results_smolagent.json", "w") as f:
+         json.dump(results, f, indent=2)
+
+     print("Results saved to results_smolagent.json")
+
+ if __name__ == "__main__":
+     asyncio.run(main())
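
One caveat in the dictionary-parsing fallback above: replacing every single quote with a double quote breaks whenever a value contains an apostrophe. A safer sketch for dict-shaped strings uses `ast.literal_eval` from the standard library instead:

```py
import ast

# A dict-shaped string as an agent might return it (illustrative value)
raw = "{'rank': 1, 'model': \"Wayne's model\", 'score': 95.2}"

# Safely evaluates Python literals; no quote rewriting needed, and the
# embedded apostrophe that would break json.loads() is handled correctly
parsed = ast.literal_eval(raw)
print(parsed["model"], parsed["score"])
```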
experiments/vision_web_browser.py ADDED
@@ -0,0 +1,210 @@
+ import argparse
+ from io import BytesIO
+ from time import sleep
+
+ import helium
+ from dotenv import load_dotenv
+ from PIL import Image
+ from selenium import webdriver
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.common.keys import Keys
+
+ from smolagents import CodeAgent, DuckDuckGoSearchTool, tool
+ from smolagents.agents import ActionStep
+ from smolagents.cli import load_model
+
+
+ github_request = """
+ I'm trying to find how hard I have to work to get a repo in github.com/trending.
+ Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
+ """  # The agent is able to achieve this request only when powered by GPT-4o or Claude-3.5-sonnet.
+
+ search_request = """
+ Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
+ """
+
+
+ def parse_arguments():
+     parser = argparse.ArgumentParser(description="Run a web browser automation script with a specified model.")
+     parser.add_argument(
+         "prompt",
+         type=str,
+         nargs="?",  # Makes it optional
+         default=github_request,
+         help="The prompt to run with the agent",
+     )
+     parser.add_argument(
+         "--model-type",
+         type=str,
+         default="LiteLLMModel",
+         help="The model type to use (e.g., OpenAIServerModel, LiteLLMModel, TransformersModel, HfApiModel)",
+     )
+     parser.add_argument(
+         "--model-id",
+         type=str,
+         default="gpt-4o",
+         help="The model ID to use for the specified model type",
+     )
+     return parser.parse_args()
+
+
+ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
+     sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
+     driver = helium.get_driver()
+     current_step = memory_step.step_number
+     if driver is not None:
+         for previous_memory_step in agent.memory.steps:  # Remove previous screenshots from logs for lean processing
+             if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
+                 previous_memory_step.observations_images = None
+         png_bytes = driver.get_screenshot_as_png()
+         image = Image.open(BytesIO(png_bytes))
+         print(f"Captured a browser screenshot: {image.size} pixels")
+         memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists, important!
+
+     # Update observations with current URL
+     url_info = f"Current url: {driver.current_url}"
+     memory_step.observations = (
+         url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
+     )
+     return
+
+
+ @tool
+ def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
+     """
+     Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
+     Args:
+         text: The text to search for
+         nth_result: Which occurrence to jump to (default: 1)
+     """
+     elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
+     if nth_result > len(elements):
+         raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
+     result = f"Found {len(elements)} matches for '{text}'."
+     elem = elements[nth_result - 1]
+     driver.execute_script("arguments[0].scrollIntoView(true);", elem)
+     result += f"Focused on element {nth_result} of {len(elements)}"
+     return result
+
+
+ @tool
+ def go_back() -> None:
+     """Goes back to previous page."""
+     driver.back()
+
+
+ @tool
+ def close_popups() -> str:
+     """
+     Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
+     """
+     webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
+
+
+ def initialize_driver():
+     """Initialize the Selenium WebDriver."""
+     chrome_options = webdriver.ChromeOptions()
+     chrome_options.add_argument("--force-device-scale-factor=1")
+     chrome_options.add_argument("--window-size=1000,1350")
+     chrome_options.add_argument("--disable-pdf-viewer")
+     chrome_options.add_argument("--window-position=0,0")
+     return helium.start_chrome(headless=False, options=chrome_options)
+
+
+ def initialize_agent(model):
+     """Initialize the CodeAgent with the specified model."""
+     return CodeAgent(
+         tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],
+         model=model,
+         additional_authorized_imports=["helium"],
+         step_callbacks=[save_screenshot],
+         max_steps=20,
+         verbosity_level=2,
+     )
+
+
+ helium_instructions = """
+ Use your web_search tool when you want to get Google search results.
+ Then you can use helium to access websites. Don't use helium for Google search, only for navigating websites!
+ Don't bother about the helium driver, it's already managed.
+ We've already run "from helium import *"
+ Then you can go to pages!
+ Code:
+ ```py
+ go_to('github.com/trending')
+ ```<end_code>
+
+ You can directly click clickable elements by inputting the text that appears on them.
+ Code:
+ ```py
+ click("Top products")
+ ```<end_code>
+
+ If it's a link:
+ Code:
+ ```py
+ click(Link("Top products"))
+ ```<end_code>
+
+ If you try to interact with an element and it's not found, you'll get a LookupError.
+ In general stop your action after each button click to see what happens on your screenshot.
+ Never try to log in to a page.
+
+ To scroll up or down, use scroll_down or scroll_up with the number of pixels to scroll as an argument.
+ Code:
+ ```py
+ scroll_down(num_pixels=1200)  # This will scroll one viewport down
+ ```<end_code>
+
+ When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
+ Just use your built-in tool `close_popups` to close them:
+ Code:
+ ```py
+ close_popups()
+ ```<end_code>
+
+ You can use .exists() to check for the existence of an element. For example:
+ Code:
+ ```py
+ if Text('Accept cookies?').exists():
+     click('I accept')
+ ```<end_code>
+
+ Proceed in several steps rather than trying to solve the task in one shot.
+ And at the end, only when you have your answer, return your final answer.
+ Code:
+ ```py
+ final_answer("YOUR_ANSWER_HERE")
+ ```<end_code>
+
+ If pages seem stuck on loading, you might have to wait, for instance `import time` and run `time.sleep(5.0)`. But don't overuse this!
+ To list elements on page, DO NOT try code-based element searches like 'contributors = find_all(S("ol > li"))': just look at the latest screenshot you have and read it visually, or use your tool search_item_ctrl_f.
+ Of course, you can act on buttons like a user would do when navigating.
+ After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url.
+ But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states.
+ Don't kill the browser.
+ When you have modals or cookie banners on screen, you should get rid of them before you can click anything else.
+ """
+
+
+ def main():
+     # Load environment variables
+     load_dotenv()
+
+     # Parse command line arguments
+     args = parse_arguments()
+
+     # Initialize the model based on the provided arguments
+     model = load_model(args.model_type, args.model_id)
+
+     global driver
+     driver = initialize_driver()
+     agent = initialize_agent(model)
+
+     # Run the agent with the provided prompt
+     agent.python_executor("from helium import *", agent.state)
+     agent.run(args.prompt + helium_instructions)
+
+
+ if __name__ == "__main__":
+     main()
main.py ADDED
@@ -0,0 +1,87 @@
+ #!/usr/bin/env python
+ """
+ Main script for the leaderboard parser.
+ This script processes the leaderboards specified in the data/best_model_for_category_list.json file
+ by matching their UIDs with hosts in data/final_leaderboards.json.
+
+ Environment variables:
+ HUGGING_FACE_HUB_TOKEN: Authentication token for Hugging Face Hub (required)
+ HUGGING_FACE_STORAGE_REPO: Target dataset name on the Hub (optional, default: leaderboard-explorer/leaderboard_explorer)
+ LEADERBOARD_REPROCESS_INTERVAL_HOURS: Interval in hours between leaderboard processing runs (default: 24)
+ """
+ import argparse
+ import logging
+ from dotenv import load_dotenv
+ import uvicorn
+ import sys
+
+ # Import from src modules
+ from src.processor import process_leaderboards
+ from src.server import app, initialize_server
+ from src.scheduler import initialize_scheduler, start_scheduler
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+ )
+ logger = logging.getLogger("leaderboard-parser")
+
+ def main():
+     """
+     Main function to process the leaderboards specified in best_model_for_category_list.json.
+     """
+     # Parse command line arguments
+     parser = argparse.ArgumentParser(description="Leaderboard Parser")
+     parser.add_argument("--clean", action="store_true", help="Clean the results file before starting")
+     parser.add_argument("--force-retry-uid", help="Force retry for a specific leaderboard UID")
+     parser.add_argument("--force-retry-category", help="Force retry for all leaderboards of a specific category")
+     parser.add_argument("--upload-only", action="store_true", help="Only upload local files to the Hub without processing leaderboards")
+     parser.add_argument("--local-only", action="store_true", help="Local mode only: do not download from the Hub and do not upload to the Hub")
+     parser.add_argument("--retry-rejected", action="store_true", help="Force reprocessing of rejected leaderboards even if it's been less than 24h")
+     parser.add_argument("--server", action="store_true", help="Run as a web server with scheduled processing")
+     args = parser.parse_args()
+
+     # Load environment variables
+     load_dotenv()
+
+     # Check if we should run in server mode
+     if args.server:
+         run_server_mode()
+         return
+
+     # Convert args to dict for process_leaderboards
+     args_dict = vars(args)
+
+     # Process the leaderboards
+     success, message = process_leaderboards(args_dict)
+
+     if success:
+         logger.info(message)
+         return 0
+     else:
+         logger.error(message)
+         return 1
+
+ def run_server_mode():
+     """Run the application in server mode with periodic processing"""
+     # Initialize server and scheduler with the process_leaderboards function
+     initialize_server(process_leaderboards)
+     initialize_scheduler(process_leaderboards)
+
+     # Start the scheduler thread
+     scheduler = start_scheduler()
+
+     try:
+         # Log startup information
+         logger.info("Running in server mode with periodic processing")
+
+         # Run the FastAPI server
+         uvicorn.run(app, host="0.0.0.0", port=7860)  # 7860 matches API_PORT and EXPOSE in the Dockerfile
+     except KeyboardInterrupt:
+         logger.info("Server stopped by user")
+     except Exception as e:
+         logger.error(f"Error running server: {e}")
+
+ if __name__ == "__main__":
+     sys.exit(main())
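
Since main.py passes `vars(args)` straight into `process_leaderboards`, which returns a `(success, message)` tuple, the pipeline can also be driven from Python. A minimal sketch, assuming the same flag names the CLI above defines:

```py
from dotenv import load_dotenv
from src.processor import process_leaderboards

load_dotenv()

# Keys mirror the argparse destinations in main.py
args = {
    "clean": False,
    "force_retry_uid": None,
    "force_retry_category": None,
    "upload_only": False,
    "local_only": True,   # stay offline: no Hub download or upload
    "retry_rejected": False,
    "server": False,
}
success, message = process_leaderboards(args)
print("OK" if success else "FAILED", message)
```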
poetry.lock ADDED
The diff for this file is too large to render.
 
pyproject.toml ADDED
@@ -0,0 +1,27 @@
+ [tool.poetry]
+ name = "leaderboard-parser"
+ version = "0.1.0"
+ description = "Parser for Hugging Face leaderboards"
+ authors = ["Thibaud Frere"]
+ readme = "README.md"
+ packages = [{include = "src"}]
+ package-mode = false
+
+ [tool.poetry.dependencies]
+ python = ">=3.10,<3.14"
+ python-dotenv = "^1.0.1"
+ opentelemetry-sdk = "^1.30.0"
+ opentelemetry-exporter-otlp = "^1.30.0"
+ openinference-instrumentation-smolagents = "^0.1.6"
+ helium = "^5.1.1"
+ huggingface-hub = "^0.29.1"
+ fastapi = "^0.115.11"
+ uvicorn = "^0.34.0"
+ smolagents = {version = "^1.9.2", extras = ["litellm"]}
+
+ [tool.poetry.scripts]
+ leaderboard-parser = "main:main"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
scripts/test_agent.py ADDED
@@ -0,0 +1,121 @@
+ #!/usr/bin/env python
+ """
+ Test script for the leaderboard parsing agent.
+ This script lets you test the agent standalone by giving it a URL.
+
+ Usage:
+     python test_agent.py <url>
+
+ Example:
+     python test_agent.py https://lmarena-ai-chatbot-arena-leaderboard.hf.space
+ """
+ import json
+ import os
+ import sys
+ import argparse
+ import datetime
+ from dotenv import load_dotenv
+
+ # Make sure the parent directory is on the import path
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ root_dir = os.path.dirname(script_dir)
+ sys.path.insert(0, root_dir)
+
+ from src.agent import get_default_model, process_leaderboard
+ from src.browser import initialize_driver, close_driver, take_initial_screenshot
+
+
+ def main():
+     """
+     Main function that tests the agent on a given URL.
+     """
+     # Load environment variables
+     load_dotenv()
+
+     # Parse command line arguments
+     parser = argparse.ArgumentParser(description="Test the leaderboard parsing agent")
+     parser.add_argument("url", help="URL of the leaderboard to parse")
+     parser.add_argument("--save", help="Path where the JSON result should be saved (optional)")
+     parser.add_argument("--uid", help="UID to use for the screenshot (optional)")
+     parser.add_argument("--wait", type=int, default=10, help="Initial wait time in seconds (default: 10)")
+     args = parser.parse_args()
+
+     # Make sure we are in the right directory
+     os.chdir(root_dir)
+
+     # Check that the API key is available
+     if not os.getenv("OPENAI_API_KEY"):
+         print("ERROR: The OPENAI_API_KEY environment variable is not set.")
+         print("Please create a .env file at the project root with your API key.")
+         print("Example: OPENAI_API_KEY=your-api-key")
+         sys.exit(1)
+
+     # Create the images directory if needed
+     if args.uid:
+         images_dir = os.path.join("data", "images")
+         os.makedirs(images_dir, exist_ok=True)
+
+     # Get the default model
+     model = get_default_model()
+
+     print(f"Testing the agent on URL: {args.url}")
+     if args.uid:
+         print(f"UID used for the screenshot: {args.uid}")
+     print(f"Initial wait time: {args.wait} seconds")
+
+     # Initialize the browser and take an initial screenshot with the custom wait time
+     initialize_driver()
+
+     if args.uid:
+         # Replace take_initial_screenshot with a custom version using the specified wait time
+         import time
+         from helium import go_to
+         from io import BytesIO
+         from PIL import Image
+
+         # Navigate to the URL
+         go_to(args.url)
+
+         # Wait for the page to load
+         print(f"Waiting {args.wait} seconds for the page to finish loading...")
+         time.sleep(args.wait)
+
+         # Take the screenshot
+         from src.browser import driver
+         png_bytes = driver.get_screenshot_as_png()
+         image = Image.open(BytesIO(png_bytes))
+
+         # Save the screenshot
+         images_dir = os.path.join("data", "images")
+         os.makedirs(images_dir, exist_ok=True)
+         screenshot_path = os.path.join(images_dir, f"{args.uid}.png")
+         image.save(screenshot_path)
+
+         print(f"Initial screenshot saved to: {screenshot_path}")
+
+         # Close the browser to reset it
+         close_driver()
+
+     # Process the leaderboard
+     result = process_leaderboard(args.url, model, 0, args.uid)
+
+     # Add metadata
+     result["url"] = args.url
+     if args.uid:
+         result["uid"] = args.uid
+         result["screenshot"] = f"images/{args.uid}.png" if os.path.exists(os.path.join("data", "images", f"{args.uid}.png")) else None
+
+     # Print the result as JSON
+     json_result = json.dumps(result, indent=2)
+     print("\nJSON result:")
+     print(json_result)
+
+     # Save the result if requested
+     if args.save:
+         with open(args.save, "w") as f:
+             f.write(json_result)
+         print(f"\nResult saved to: {args.save}")
+
+
+ if __name__ == "__main__":
+     main()
src/__pycache__/agent.cpython-310.pyc ADDED
Binary file (13 kB)

src/__pycache__/browser.cpython-310.pyc ADDED
Binary file (2.27 kB)

src/__pycache__/browser_utils.cpython-310.pyc ADDED
Binary file (3.56 kB)

src/__pycache__/file_utils.cpython-310.pyc ADDED
Binary file (8.2 kB)

src/__pycache__/hub_utils.cpython-310.pyc ADDED
Binary file (4.29 kB)

src/__pycache__/leaderboard_processor.cpython-310.pyc ADDED
Binary file (3.65 kB)

src/__pycache__/processor.cpython-310.pyc ADDED
Binary file (9.87 kB)

src/__pycache__/scheduler.cpython-310.pyc ADDED
Binary file (2.52 kB)

src/__pycache__/server.cpython-310.pyc ADDED
Binary file (2.44 kB)

src/__pycache__/tools.cpython-310.pyc ADDED
Binary file (11.3 kB)

src/agents/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (413 Bytes)

src/agents/__pycache__/agent.cpython-310.pyc ADDED
Binary file (13.1 kB)

src/agents/__pycache__/agent_core.cpython-310.pyc ADDED
Binary file (1.63 kB)

src/agents/__pycache__/agent_instructions.cpython-310.pyc ADDED
Binary file (11.2 kB)

src/agents/__pycache__/agent_processor.cpython-310.pyc ADDED
Binary file (5.58 kB)

src/agents/__pycache__/agent_tools.cpython-310.pyc ADDED
Binary file (13.2 kB)

src/agents/__pycache__/browser.cpython-310.pyc ADDED
Binary file (4.05 kB)

src/agents/__pycache__/prompts.cpython-310.pyc ADDED
Binary file (11 kB)

src/agents/__pycache__/tools.cpython-310.pyc ADDED
Binary file (13.6 kB)

src/agents/__pycache__/validators.cpython-310.pyc ADDED
Binary file (3.28 kB)
 
src/agents/browser.py ADDED
@@ -0,0 +1,148 @@
+ """
+ Browser management for the leaderboard agent.
+ """
+ import os
+ import gc
+ import logging
+ from io import BytesIO
+ from time import sleep
+
+ import helium
+ from PIL import Image
+ from selenium import webdriver
+ from smolagents import CodeAgent
+ from smolagents.agents import ActionStep
+
+ # Logger configuration
+ logger = logging.getLogger("leaderboard-parser")
+
+ # Global driver variable
+ driver = None
+
+
+ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
+     """
+     Save a screenshot of the current browser state in memory for the agent.
+     This is used as a callback for the agent to visualize the page.
+     The screenshot is only kept in memory and not saved to disk.
+     """
+     sleep(2.0)  # Increased to allow time for JavaScript animations
+     current_step = memory_step.step_number
+     if driver is not None:
+         for previous_memory_step in agent.memory.steps:  # Remove previous screenshots from logs for lean processing
+             if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
+                 previous_memory_step.observations_images = None
+
+         # Capture screenshot for agent visualization only (not saved to disk)
+         png_bytes = driver.get_screenshot_as_png()
+         image = Image.open(BytesIO(png_bytes))
+         print(f"Captured a browser screenshot for agent: {image.size} pixels")
+         memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists, important!
+
+         # Update observations with current URL
+         url_info = f"Current url: {driver.current_url}"
+         memory_step.observations = (
+             url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
+         )
+     return
+
+
+ def initialize_driver():
+     """
+     Initialize the Selenium WebDriver.
+     Returns a configured Chrome WebDriver instance.
+     """
+     global driver
+
+     # If the driver already exists, clean it up first to avoid memory leaks
+     if driver is not None:
+         close_driver()
+
+     print("Starting Chrome browser initialization...")
+
+     chrome_options = webdriver.ChromeOptions()
+     chrome_options.add_argument("--force-device-scale-factor=1")
+     chrome_options.add_argument("--window-size=1600,1400")
+     chrome_options.add_argument("--disable-pdf-viewer")
+     chrome_options.add_argument("--window-position=0,0")
+
+     # Essential options for the containerized environment
+     chrome_options.add_argument("--no-sandbox")
+     chrome_options.add_argument("--disable-dev-shm-usage")
+     chrome_options.add_argument("--disable-gpu")
+     chrome_options.add_argument("--disable-extensions")
+     chrome_options.add_argument("--disable-software-rasterizer")
+     chrome_options.add_argument("--no-first-run")
+     chrome_options.add_argument("--no-zygote")
+     chrome_options.add_argument("--single-process")
+
+     # Path to Chrome binary
+     chrome_path = os.environ.get("CHROME_PATH", "/usr/bin/google-chrome-stable")
+     if os.path.exists(chrome_path):
+         print(f"Using Chrome at: {chrome_path}")
+         chrome_options.binary_location = chrome_path
+
+     # Print the options for diagnostics
+     print(f"Configured Chrome options: {chrome_options.arguments}")
+
+     try:
+         print("Attempting to start Chrome with Helium...")
+         driver = helium.start_chrome(headless=True, options=chrome_options)
+         print("Chrome started successfully!")
+
+         # Browser information
+         print(f"Chrome version: {driver.capabilities.get('browserVersion', 'Unknown')}")
+         print(f"Platform: {driver.capabilities.get('platformName', 'Unknown')}")
+
+         # Set page load timeout
+         driver.set_page_load_timeout(30)  # Increased to 30 seconds
+
+         return driver
+     except Exception as e:
+         print(f"ERROR while starting Chrome: {str(e)}")
+         # Capture the full traceback for diagnostics
+         import traceback
+         print("Full error trace:")
+         traceback.print_exc()
+
+         # Check whether Chrome is available at all
+         try:
+             import subprocess
+             chrome_version_cmd = f"{chrome_path} --version"
+             version_output = subprocess.check_output(chrome_version_cmd, shell=True, stderr=subprocess.STDOUT).decode()
+             print(f"Installed Chrome version: {version_output.strip()}")
+         except Exception as chrome_check_error:
+             print(f"Unable to check the Chrome version: {str(chrome_check_error)}")
+
+         raise
+
+
+ def close_driver():
+     """
+     Close the browser and clean up resources.
+     """
+     global driver
+
+     try:
+         print("Closing the browser and cleaning up resources...")
+
+         # Use helium.kill_browser() to shut the browser down cleanly
+         helium.kill_browser()
+
+         # Release the reference
+         driver = None
+
+         # Force the garbage collector
+         gc.collect()
+
+         print("Browser closed successfully")
+     except Exception as e:
+         print(f"Error closing browser: {e}")
+
+
+ # Alias of close_driver for compatibility with browser_utils.cleanup_browser
+ def cleanup_browser():
+     """
+     Alias of close_driver for compatibility with the existing API.
+     """
+     close_driver()
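
A minimal sketch of how this module's driver lifecycle is meant to be used, based on the functions above (the target URL is just an example):

```py
import helium
from src.agents import browser

# initialize_driver() closes any existing driver first, then starts headless Chrome
driver = browser.initialize_driver()
try:
    helium.go_to("https://huggingface.co")
    print(driver.title)
finally:
    # kills the browser, releases the module-global reference, and runs gc.collect()
    browser.close_driver()
```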
src/agents/fact_checker/fact_checker_agent.py ADDED
@@ -0,0 +1,3 @@
+ """
+ Agent management for the leaderboard-parser fact-checker agent.
+ """
src/agents/parser/__pycache__/agent.cpython-310.pyc ADDED
Binary file (13.1 kB)

src/agents/parser/__pycache__/parser_agent.cpython-310.pyc ADDED
Binary file (13.2 kB)
 
src/agents/parser/parser_agent.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent management for the leaderboard parser.
3
+ """
4
+ import datetime
5
+ from smolagents import CodeAgent
6
+ from smolagents.cli import load_model
7
+
8
+ from src.agents.browser import save_screenshot
9
+ from src.agents.tools import (
10
+ map_clickable_elements,
11
+ close_popups,
12
+ extract_table_data,
13
+ find_leaderboard_elements,
14
+ go_back,
15
+ search_item_ctrl_f,
16
+ copy_link_from_element,
17
+ validate_json_results,
18
+ find_model_links,
19
+ click_at_coordinates,
20
+ )
21
+
22
+
23
+ def initialize_agent(model):
24
+ """
25
+ Initialize the CodeAgent with the specified model.
26
+
27
+ Args:
28
+ model: The LLM model to use for the agent
29
+
30
+ Returns:
31
+ A configured CodeAgent instance
32
+ """
33
+ return CodeAgent(
34
+ tools=[go_back, map_clickable_elements, validate_json_results, close_popups, search_item_ctrl_f, extract_table_data, find_leaderboard_elements, copy_link_from_element, find_model_links, click_at_coordinates],
35
+ model=model,
36
+ additional_authorized_imports=["selenium", "helium", "time", "json", "re"],
37
+ step_callbacks=[save_screenshot],
38
+ max_steps=25,
39
+ verbosity_level=2,
40
+ )
41
+
42
+
43
+ def get_default_model():
44
+ """
45
+ Get the default model for the agent.
46
+
47
+ Returns:
48
+ A configured model instance
49
+ """
50
+ model_type = "LiteLLMModel"
51
+ model_id = "gpt-4o"
52
+ return load_model(model_type, model_id)
53
+
54
+
55
+ # Instructions for the agent
56
+ leaderboard_instructions = """
57
+ Your task is to extract the three BEST models from the leaderboard. It is crucial that you identify the models that are at the top of the ranking, not just any three models present on the page.
58
+
59
+ You must also identify the main criterion on which the models are evaluated (for example: accuracy, speed, performance on a specific benchmark, etc.). Formulate a short description (less than 60 words) that explains what the models are judged on.
60
+
61
+ For each model, try to find a link to its page or repository. This can be any link (GitHub, Hugging Face, model website, etc.). If you cannot find a link for a model, indicate null for this field.
62
+
63
+ IMPORTANT: If you fail to clearly identify the top three models AND the evaluation criterion, the leaderboard will be rejected. It is essential that you provide this information accurately and completely.
64
+
65
+ You can use helium to navigate the website. We have already executed "from helium import *".
66
+ You can go to pages with:
67
+ ```py
68
+ go_to('url')
69
+ ```<end_code>
70
+
71
+ You can click on clickable elements by entering the text that appears on them:
72
+ ```py
73
+ click("Button text")
74
+ ```<end_code>
75
+
76
+ If it's a link:
77
+ ```py
78
+ click(Link("Link text"))
79
+ ```<end_code>
80
+
81
+ To scroll up or down, use scroll_down or scroll_up with the number of pixels as an argument:
82
+ ```py
83
+ scroll_down(num_pixels=1200) # This will scroll down one view
84
+ ```<end_code>
85
+
86
+ To close popups with an X icon, use the built-in tool `close_popups`:
87
+ ```py
88
+ close_popups()
89
+ ```<end_code>
90
+
91
+ You can use .exists() to check for the existence of an element:
92
+ ```py
93
+ if Text('Accept cookies?').exists():
94
+ click('I accept')
95
+ ```<end_code>
96
+
97
+ If you encounter situations where you cannot click on elements using text, you can use click_at_coordinates to click at specific x,y coordinates on the page:
98
+ ```py
99
+ click_at_coordinates(x=500, y=300) # Click at the position 500px from left, 300px from top
100
+ ```<end_code>
101
+
102
+ If pages seem stuck while loading, you may need to wait:
103
+ ```py
104
+ import time
105
+ time.sleep(20.0) # Wait at least 20 seconds for the initial loading
106
+ ```<end_code>
107
+
108
+ To extract data from a table, use the extract_table_data tool:
109
+ ```py
110
+ table_info = extract_table_data()
111
+ print(table_info)
112
+ ```<end_code>
113
+
114
+ If you cannot easily find a standard table, use find_leaderboard_elements to search for elements that might contain ranking data:
115
+ ```py
116
+ leaderboard_elements = find_leaderboard_elements()
117
+ print(leaderboard_elements)
118
+ ```<end_code>
119
+
120
+ RECOMMENDED METHODS FOR FINDING MODEL LINKS:
121
+
122
+ ```py
123
+ # For a model named "BERT-Large"
124
+ model_name = "BERT-Large"
125
+ links_info = find_model_links(model_name)
126
+ print(links_info)
127
+
128
+ # If links were found, the best candidate is displayed at the end of the result
129
+ if "Best candidate for" in links_info:
130
+ # Extract the URL of the best candidate
131
+ best_url_line = links_info.split("Best candidate for")[1].split("\n")[1]
132
+ url = best_url_line.replace("URL:", "").strip()
133
+ print(f"URL for model {model_name}: {url}")
134
+ else:
135
+ print(f"No link found for model {model_name}")
136
+ url = None
137
+ ```<end_code>
138
+
139
+
140
+ IMPORTANT: If none of the methods can find a URL, do NOT try other methods such as extracting URLs from the source code. Simply use null for the model URL. It is better to have a missing URL (null) than an incorrect or irrelevant URL.
141
+
142
+ IMPORTANT - PAGE EXPLORATION ORDER:
143
+ If you don't immediately see the leaderboard table or ranking information, STRICTLY follow this order:
144
+
145
+ 1. ABSOLUTE PRIORITY:
146
+ Look for and click on buttons, tabs, or links with text like "Leaderboard", "Results", "Ranking", "Benchmark", "Scores", "Evaluation", etc.
147
+ Examine ALL visible buttons and tabs before moving to the next step.
148
+ IMPORTANT: Be flexible with text matching! Some elements may contain emojis or other characters before/after the keywords.
149
+
150
+ ```py
151
+ # Examples of searching for leaderboard buttons/tabs
152
+ for text in ["🏆 Leaderboard", "Leaderboard", "Results", "Ranking", "Benchmark", "Scores", "Evaluation", "Performance"]:
153
+ if Button(text).exists() or Link(text).exists() or Text(text).exists():
154
+ print(f"Found clickable element: {text}")
155
+ click(text)
156
+ time.sleep(5) # Wait for the page to update
157
+ break
158
+
159
+ # If exact matches fail, try more flexible matching
160
+ # This is crucial for elements with emojis or other characters
161
+ if True: # Fallback: note this block always runs; skip it if an exact match above already clicked
162
+ print("No exact matches found. Trying flexible text matching...")
163
+ import time
164
+ from src.agents.browser import driver
165
+ from selenium.webdriver.common.by import By
166
+
167
+ for text in ["🏆 Leaderboard", "Leaderboard", "Results", "Ranking", "Benchmark", "Scores"]:
168
+ # Try to find elements CONTAINING the text (not exact match)
169
+ matching_elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
170
+
171
+ if matching_elements:
172
+ print(f"Found {len(matching_elements)} elements containing '{text}'")
173
+ for element in matching_elements[:3]: # Try first three matches
174
+ try:
175
+ element_text = element.text
176
+ print(f"Element text: '{element_text}'")
177
+ driver.execute_script("arguments[0].scrollIntoView(true);", element)
178
+ time.sleep(1)
179
+ element.click()
180
+ print(f"Successfully clicked on element with text: '{element_text}'")
181
+ time.sleep(5)
182
+ break
183
+ except Exception as e:
184
+ print(f"Could not click: {e}")
185
+ # Try JavaScript click as fallback
186
+ try:
187
+ driver.execute_script("arguments[0].click();", element)
188
+ print(f"Clicked using JavaScript on element with text: '{element_text}'")
189
+ time.sleep(5)
190
+ break
191
+ except:
192
+ continue
193
+ ```<end_code>
194
+
195
+ 2. ONLY AFTER checking all buttons and tabs, scroll down to see if the content is lower down:
196
+ ```py
197
+ scroll_down(1200) # Try scrolling to see more content
198
+ ```<end_code>
199
+
200
+ 3. Check if there are dropdown menus or filters to activate
201
+ 4. Explore the different sections of the page
202
+
203
+ Proceed step by step:
204
+ 1. Navigate to the provided URL
205
+ 2. Wait for the page to load completely (use time.sleep(20.0))
206
+ 3. EXPLORE the page by STRICTLY following the order above (first buttons/tabs, then scroll if necessary)
207
+ 4. Look for the table or section containing the model ranking
208
+ 5. Identify the three BEST models (those at the top of the ranking) (DO NOT CHANGE MODEL NAMES UNDER ANY CIRCUMSTANCES)
209
+ 6. Determine the main evaluation criterion for the models
210
+ 7. IMPORTANT: For each identified model, use the method described above to find its URL. If the URL is not found, use null.
211
+ 8. If you cannot find links on the first try, you may retry using the same method.
212
+ 9. Validate the results using the validate_json_results tool. VERY IMPORTANT TO DO BEFORE SENDING RESULTS.
213
+ 10. Send final results
214
+
215
+ ```py
216
+ final_answer({
217
+ "top_models": [
218
+ {"rank": 1, "name": "Model name 1", "url": "Model URL or null if not available"},
219
+ {"rank": 2, "name": "Model name 2", "url": "Model URL or null if not available"},
220
+ {"rank": 3, "name": "Model name 3", "url": "Model URL or null if not available"}
221
+ ],
222
+ "evaluation_criteria": "Short description of the evaluation criterion (less than 60 words)"
223
+ })
224
+ ```<end_code>
225
+
226
+ After each block of code you write, you will automatically receive an updated screenshot of the browser and the current URL of the browser.
227
+ But be careful, the screenshot will only be taken at the end of the complete action, it will not see intermediate states.
228
+
229
+ IMPORTANT: DO NOT CHANGE MODEL NAMES UNDER ANY CIRCUMSTANCES
230
+ """
231
+
232
+
233
+ def validate_results(result):
234
+ """Checks that the results do not contain generic placeholders."""
235
+ if not result or not isinstance(result, dict):
236
+ return False, "Invalid result"
237
+
238
+ if "top_models" not in result or len(result.get("top_models", [])) < 3:
239
+ return False, "Less than 3 models found"
240
+
241
+ # Check for generic names
242
+ generic_names = ["model a", "model b", "model c", "model 1", "model 2", "model 3", "model name", "unavailable"]
243
+ model_names = [m.get("name", "").lower() for m in result.get("top_models", [])]
244
+ if any(name in generic_names for name in model_names):
245
+ return False, "Generic model names detected"
246
+
247
+ # Check for generic URLs
248
+ generic_urls = ["example.com", "example.org"]
249
+ model_urls = [m.get("url", "").lower() for m in result.get("top_models", []) if m.get("url") is not None]
250
+ if any(generic in url for url in model_urls for generic in generic_urls):
251
+ return False, "Generic URLs detected"
252
+
253
+ # Check the evaluation criterion
254
+ if "evaluation_criteria" not in result or len(result.get("evaluation_criteria", "")) < 10:
255
+ return False, "Evaluation criterion missing or too short"
256
+
257
+ return True, "Valid results"
258
+
259
+
260
+ def process_leaderboard(url, model, index, uid=None, additional_rules=None):
261
+ """
262
+ Process a single leaderboard URL and return the results.
263
+
264
+ Args:
265
+ url: The URL of the leaderboard to process
266
+ model: The LLM model to use
267
+ index: The index of the leaderboard in the list
268
+ uid: The UID of the leaderboard (for saving screenshots)
269
+ additional_rules: Additional rules specific to this leaderboard
270
+
271
+ Returns:
272
+ A dictionary with the results or error information
273
+ """
274
+ from src.agents.browser import initialize_driver, close_driver
275
+
276
+ print(f"\n\n{'='*50}")
277
+ print(f"Processing leaderboard {index+1}: {url}")
278
+ if uid:
279
+ print(f"UID: {uid}")
280
+ if additional_rules:
281
+ print(f"Additional rules: {additional_rules}")
282
+ print(f"{'='*50}\n")
283
+
284
+ # Get current date and time
285
+ now = datetime.datetime.now()
286
+ parsed_at = now.isoformat()
287
+
288
+ initialize_driver()
289
+
290
+ agent = initialize_agent(model)
291
+
292
+ # Create the prompt with the target URL
293
+ prompt = f"Visit {url} and extract the three BEST models from the leaderboard (those at the top of the ranking). Also identify the main evaluation criterion for the models and look for links associated with the models."
294
+
295
+ # Add additional rules if provided
296
+ instructions = leaderboard_instructions
297
+ if additional_rules:
298
+ instructions = f"""
299
+
300
+ ADDITIONAL RULES SPECIFIC TO THIS LEADERBOARD:
301
+ {additional_rules}
302
+
303
+ {leaderboard_instructions}
304
+
305
+ ADDITIONAL RULES SPECIFIC TO THIS LEADERBOARD:
306
+ {additional_rules}
307
+ """
308
+
309
+ try:
310
+ # Run the agent with the provided prompt
311
+ agent.python_executor("from helium import *")
312
+ result = agent.run(prompt + instructions)
313
+
314
+ print(f"\nResult for {url}:")
315
+ print(result)
316
+
317
+ # Check if the result is None or empty
318
+ if not result:
319
+ return {
320
+ "results": None,
321
+ "parsing_status": "error",
322
+ "parsing_message": "Empty result from agent",
323
+ "parsed_at": parsed_at
324
+ }
325
+
326
+ # Validate the results
327
+ is_valid, reason = validate_results(result)
328
+ if not is_valid:
329
+ print(f"WARNING: {reason}")
330
+ return {
331
+ "results": result,
332
+ "parsing_status": "invalid",
333
+ "parsing_message": reason,
334
+ "parsed_at": parsed_at
335
+ }
336
+
337
+ # Make sure the response is in the correct format
338
+ if not isinstance(result, dict) or "top_models" not in result:
339
+ print("WARNING: Agent did not use final_answer() correctly")
340
+ return {
341
+ "results": None,
342
+ "parsing_status": "error",
343
+ "parsing_message": "Agent returned improperly formatted response (did not use final_answer correctly)",
344
+ "parsed_at": parsed_at
345
+ }
346
+
347
+ return {
348
+ "results": result,
349
+ "parsing_status": "success",
350
+ "parsed_at": parsed_at
351
+ }
352
+ except Exception as e:
353
+ print(f"An error occurred while processing {url}: {e}")
354
+ return {
355
+ "results": None,
356
+ "parsing_status": "error",
357
+ "parsing_message": str(e),
358
+ "parsed_at": parsed_at
359
+ }
360
+ finally:
361
+ # Ensure browser is closed
362
+ close_driver()
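For orientation, a minimal usage sketch of this module follows; the URL and UID are placeholders, and running it assumes a working browser environment plus an OpenAI key for the default gpt-4o model:
```py
# Hypothetical end-to-end call (placeholder URL/UID; needs a browser and OPENAI_API_KEY).
from src.agents.parser.parser_agent import get_default_model, process_leaderboard

model = get_default_model()
outcome = process_leaderboard("https://example-leaderboard.org", model, index=0, uid="demo")

if outcome["parsing_status"] == "success":
    for entry in outcome["results"]["top_models"]:
        print(entry["rank"], entry["name"], entry["url"])
else:
    print("Parsing failed:", outcome.get("parsing_message"))
```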
src/agents/tools.py ADDED
@@ -0,0 +1,443 @@
1
+ """
2
+ Tools for the leaderboard agent.
3
+ """
4
+ from selenium import webdriver
5
+ from selenium.webdriver.common.by import By
6
+ from selenium.webdriver.common.keys import Keys
7
+ from selenium.webdriver.common.action_chains import ActionChains
8
+ import re
9
+ import time
10
+ import helium
11
+
12
+ from smolagents import tool
13
+
14
+
15
+ @tool
16
+ def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
17
+ """
18
+ Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
19
+ Args:
20
+ text: The text to search for
21
+ nth_result: Which occurrence to jump to (default: 1)
22
+ """
23
+ from src.agents.browser import driver
24
+
25
+ elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
26
+ if nth_result > len(elements):
27
+ raise Exception(f"Match #{nth_result} not found (only {len(elements)} matches found)")
28
+ result = f"Found {len(elements)} matches for '{text}'."
29
+ elem = elements[nth_result - 1]
30
+ driver.execute_script("arguments[0].scrollIntoView(true);", elem)
31
+ result += f" Focused on element {nth_result} of {len(elements)}"
32
+ return result
33
+
34
+
35
+ @tool
36
+ def go_back() -> str:
37
+ """
38
+ Navigate back to the previous page.
39
+ """
40
+ from src.agents.browser import driver
41
+
42
+ driver.back()
43
+ time.sleep(2) # Wait for page to load
44
+ return "Navigated back to previous page"
45
+
46
+
47
+ @tool
48
+ def close_popups() -> str:
49
+ """
50
+ Closes any popup/modal dialogs that might be open on the page.
51
+ Useful when pop-ups appear (cookies, login prompts, etc.) that block interaction.
52
+ """
53
+ from src.agents.browser import driver
54
+
55
+ # Try to find common popup elements
56
+ popup_selectors = [
57
+ "//button[contains(text(), 'Accept')]",
58
+ "//button[contains(text(), 'Close')]",
59
+ "//button[contains(text(), 'Fermer')]",
60
+ "//button[contains(text(), 'OK')]",
61
+ "//button[contains(text(), 'Got it')]",
62
+ "//button[contains(@class, 'close')]",
63
+ "//div[contains(@class, 'popup')]//button",
64
+ "//div[contains(@class, 'modal')]//button",
65
+ "//div[contains(@class, 'dialog')]//button"
66
+ ]
67
+
68
+ found = False
69
+ for selector in popup_selectors:
70
+ try:
71
+ popup_elements = driver.find_elements(By.XPATH, selector)
72
+ for elem in popup_elements:
73
+ if elem.is_displayed():
74
+ elem.click()
75
+ found = True
76
+ time.sleep(0.5) # Wait for popup to disappear
77
+ except Exception as e:
78
+ pass # Ignore errors, try next selector
79
+
80
+ return "Closed popup dialogs" if found else "No popup dialogs found"
81
+
82
+
83
+ @tool
84
+ def extract_table_data(table_caption: str = None, table_index: int = 1) -> str:
85
+ """
86
+ Extracts headers and a sample of rows from the tables on the page. Note: the caption/index arguments are currently unused; all tables are listed.
87
+ Args:
88
+ table_caption: Text in or near the table to find (default: None - will use index)
89
+ table_index: The index of the table if caption is not provided (1-based)
90
+ """
91
+ from src.agents.browser import driver
92
+
93
+ tables = driver.find_elements(By.TAG_NAME, "table")
94
+ if not tables:
95
+ return "No tables found on the page."
96
+
97
+ result = f"Found {len(tables)} table(s) on the page.\n"
98
+
99
+ for i, table in enumerate(tables):
100
+ result += f"\nTable {i+1}:\n"
101
+
102
+ # Try to get headers
103
+ headers = table.find_elements(By.TAG_NAME, "th")
104
+ if headers:
105
+ header_texts = [header.text for header in headers]
106
+ result += f"Headers: {', '.join(header_texts)}\n"
107
+
108
+ # Get rows
109
+ rows = table.find_elements(By.TAG_NAME, "tr")
110
+ result += f"Found {len(rows)} rows.\n"
111
+
112
+ # Get first 5 rows as sample
113
+ for j, row in enumerate(rows[:5]):
114
+ cells = row.find_elements(By.TAG_NAME, "td")
115
+ if cells:
116
+ cell_texts = [cell.text for cell in cells]
117
+ result += f"Row {j+1}: {' | '.join(cell_texts)}\n"
118
+
119
+ return result
120
+
121
+
122
+ @tool
123
+ def find_leaderboard_elements() -> str:
124
+ """
125
+ Find key elements of a leaderboard: title, evaluation criteria, and model rankings.
126
+ Returns a structured description of what was found.
127
+ """
128
+ from src.agents.browser import driver
129
+
130
+ result = ""
131
+
132
+ # Check for tables first
133
+ tables = driver.find_elements(By.TAG_NAME, "table")
134
+ if tables:
135
+ result += f"Found {len(tables)} table(s) that might contain leaderboard data.\n"
136
+
137
+ # Check for ordered lists
138
+ ol_elements = driver.find_elements(By.TAG_NAME, "ol")
139
+ if ol_elements:
140
+ result += f"Found {len(ol_elements)} ordered list(s) that might contain rankings.\n"
141
+
142
+ # Check for div elements with grid or flex display that might be custom leaderboards
143
+ grid_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'grid') or contains(@class, 'flex') or contains(@class, 'table') or contains(@class, 'rank') or contains(@class, 'leaderboard')]")
144
+ if grid_elements:
145
+ result += f"Found {len(grid_elements)} div elements with grid/flex/table classes that might be custom leaderboards.\n"
146
+
147
+ # Look for elements with rank or position indicators
148
+ rank_elements = driver.find_elements(By.XPATH, "//*[contains(@class, 'rank') or contains(@class, 'position') or contains(@class, 'standing')]")
149
+ if rank_elements:
150
+ result += f"Found {len(rank_elements)} elements with rank/position classes.\n"
151
+
152
+ if not result:
153
+ return "Could not find any obvious leaderboard elements. Try scrolling or navigating to the correct section."
154
+
155
+ return result
156
+
157
+ @tool
158
+ def map_clickable_elements(keyword: str = None) -> str:
159
+ """
160
+ Displays a list of all clickable elements on the page with their coordinates.
161
+
162
+ Args:
163
+ keyword: Optional keyword to filter elements. If specified, only elements containing this keyword will be displayed.
164
+
165
+ Returns:
166
+ A string listing all clickable elements with their coordinates.
167
+ """
168
+ from src.agents.browser import driver
169
+
170
+ clickable_selectors = [
171
+ "a", "button", "input[type='button']", "input[type='submit']",
172
+ ".clickable", "[role='button']", "[onclick]"
173
+ ]
174
+
175
+ result = "Clickable elements detected:\n"
176
+ total = 0
177
+
178
+ for selector in clickable_selectors:
179
+ elements = driver.find_elements(By.CSS_SELECTOR, selector)
180
+ for i, element in enumerate(elements):
181
+ try:
182
+ text = element.text.strip()
183
+ if not text and element.get_attribute("value"):
184
+ text = element.get_attribute("value")
185
+
186
+ # Skip empty or non-visible elements
187
+ if not text or not element.is_displayed():
188
+ continue
189
+
190
+ # Filter by keyword if specified
191
+ if keyword and keyword.lower() not in text.lower():
192
+ continue
193
+
194
+ rect = element.rect
195
+ x = int(rect['x'] + rect['width']/2)
196
+ y = int(rect['y'] + rect['height']/2)
197
+
198
+ result += f"{total+1}. '{text}' ({selector}) - coords: x={x}, y={y}\n"
199
+ total += 1
200
+ except:
201
+ continue
202
+
203
+ result += f"\nTotal: {total} clickable elements" + (" containing '" + keyword + "'" if keyword else "")
204
+ return result
205
+
206
+ @tool
207
+ def copy_link_from_element(text_to_find: str, link_position: int = 1) -> str:
208
+ """
209
+ Find elements with specified text and return the URL if it's a link or has a parent link.
210
+ Args:
211
+ text_to_find: Text to search for
212
+ link_position: If multiple matches, which one to use (1-based)
213
+ """
214
+ from src.agents.browser import driver
215
+
216
+ try:
217
+ # Try to find an element with the given text
218
+ element = next(iter(driver.find_elements(By.XPATH, f"//*[contains(text(), '{text_to_find}')]")), None) # find_element_by_* was removed in Selenium 4; find_elements keeps the None check below meaningful
219
+ if not element:
220
+ return f"No element containing the text '{text_to_find}' was found."
221
+
222
+ # Try to find URL directly from the element
223
+ href = element.get_attribute("href")
224
+ if href:
225
+ return f"URL found: {href}"
226
+
227
+ # Try to find a parent that is a link
228
+ parent = next(iter(element.find_elements(By.XPATH, "./ancestor::a")), None)
229
+ if parent:
230
+ href = parent.get_attribute("href")
231
+ if href:
232
+ return f"URL found in parent element: {href}"
233
+
234
+ # Try to find a child that is a link
235
+ child = next(iter(element.find_elements(By.XPATH, ".//a")), None)
236
+ if child:
237
+ href = child.get_attribute("href")
238
+ if href:
239
+ return f"URL found in child element: {href}"
240
+
241
+ # Method 4: Try a right-click and "Copy link address"
242
+ actions = ActionChains(driver)
243
+ actions.context_click(element).perform()
244
+
245
+ # Wait briefly for the context menu to appear
246
+ import time
247
+ time.sleep(1)
248
+
249
+ # Try to find and click "Copy link address" or an equivalent entry
250
+ # Note: this is highly browser- and language-dependent; the native context menu is not part of the page DOM, so this lookup will usually fail
251
+ copy_link_texts = ["Copy link address", "Copier l'adresse du lien", "Copy Link", "Copier le lien"]
252
+
253
+ for text in copy_link_texts:
254
+ try:
255
+ link_option = driver.find_element(By.XPATH, f"//div[contains(text(), '{text}')]")
256
+ link_option.click()
257
+ return f"Action 'Copier l'adresse du lien' effectuée pour '{text_to_find}'"
258
+ except:
259
+ continue
260
+
261
+ # Dismiss the context menu
262
+ webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
263
+
264
+ return f"Impossible de trouver un lien pour l'élément '{text_to_find}' avec les méthodes disponibles."
265
+
266
+ except Exception as e:
267
+ return f"Erreur lors de la recherche du lien: {str(e)}"
268
+
269
+ @tool
270
+ def validate_json_results(result: dict) -> tuple[bool, str]:
271
+ """
272
+ Checks that the results do not contain generic placeholders.
273
+ Args:
274
+ result: The result to validate
275
+ Returns:
276
+ A tuple containing a boolean indicating if the result is valid and a message
277
+ explaining why the result is invalid if it is not valid.
278
+ """
279
+ if not result or not isinstance(result, dict):
280
+ return False, "Invalid result"
281
+
282
+ if "top_models" not in result or len(result.get("top_models", [])) < 3:
283
+ return False, "Less than 3 models found"
284
+
285
+ # Check for duplicate models
286
+ seen_models = set()
287
+ for model in result.get("top_models", []):
288
+ model_name = model.get("name", "").lower()
289
+ if model_name in seen_models:
290
+ return False, f"Duplicate model '{model.get('name')}' found. Please ensure each model is unique."
291
+ seen_models.add(model_name)
292
+
293
+ # Check for generic names
294
+ generic_names = ["model a", "model b", "model c", "model 1", "model 2", "model 3", "model name", "unavailable"]
295
+ model_names = [m.get("name", "").lower() for m in result.get("top_models", [])]
296
+ if any(name in generic_names for name in model_names):
297
+ return False, "Generic model names detected"
298
+
299
+ # Check for unwanted suffixes in model names
300
+ unwanted_suffix_pattern = r"\(.*\)$"
301
+ for model in result.get("top_models", []):
302
+ if re.search(unwanted_suffix_pattern, model.get("name", "")):
303
+ return False, f"Model name '{model.get('name')}' contains unwanted suffixes. Please remove them if you think they are not part of the model name. If it's a version number or a date, keep it."
304
+
305
+ # Check for generic URLs
306
+ generic_urls = ["example.com", "example.org"]
307
+ model_urls = [m.get("url", "").lower() for m in result.get("top_models", []) if m.get("url") is not None]
308
+ if any(generic in url for url in model_urls for generic in generic_urls):
309
+ return False, "Generic URLs detected"
310
+
311
+ # Check for submatch between model name and URL
312
+ for model in result.get("top_models", []):
313
+ name = model.get("name", "").lower()
314
+ url = model.get("url")
315
+
316
+ # Skip validation if URL is None or empty - this is acceptable, so no warning
317
+ if not url:
318
+ continue
319
+
320
+ url = url.lower()
321
+ if url and len(name) >= 4 and not any(name[i:i+4] in url for i in range(len(name) - 3)): # guard: names shorter than 4 chars would otherwise always fail this check
322
+ return False, f"URL for model '{model.get('name')}' does not have a valid submatch with the name. This is probably a wrong URL. Please check the URL and try again."
323
+
324
+ # Check the evaluation criterion
325
+ if "evaluation_criteria" not in result or len(result.get("evaluation_criteria", "")) < 10:
326
+ return False, "Evaluation criterion missing or too short"
327
+
328
+ return True, "Valid results"
329
+
330
+ @tool
331
+ def find_model_links(model_name: str) -> str:
332
+ """
333
+ Search for links that might point to a model based on their URL
334
+ and their match with the model name.
335
+ Args:
336
+ model_name: The name of the model to search for
337
+
338
+ Returns:
339
+ A list of potential links to the model
340
+ """
341
+ from src.agents.browser import driver
342
+ try:
343
+ # 1. Retrieve all links on the page
344
+ all_links = driver.find_elements(By.TAG_NAME, "a")
345
+ if not all_links:
346
+ return "No links were found on the page."
347
+
348
+ # 2. Known patterns for model repositories
349
+ model_url_patterns = [
350
+ r'huggingface\.co/[^/]+/[^/]+', # Hugging Face model repo
351
+ r'github\.com/[^/]+/[^/]+', # GitHub repo
352
+ ]
353
+
354
+ model_links = []
355
+ model_name_lower = model_name.lower()
356
+
357
+ for link in all_links:
358
+ try:
359
+ # Check if the link is visible and has an href attribute
360
+ if not link.is_displayed() or not link.get_attribute('href'):
361
+ continue
362
+
363
+ link_url = link.get_attribute('href')
364
+ link_text = link.text.strip()
365
+
366
+ # Ignore links to non-relevant resources
367
+ if link_url.endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.ico', '.css', '.js')):
368
+ continue
369
+
370
+ # Check if the URL matches a known pattern
371
+ matches_pattern = any(re.search(pattern, link_url, re.IGNORECASE) for pattern in model_url_patterns)
372
+
373
+ if matches_pattern:
374
+ # Check for a 5-character submatch between the model name and the URL
375
+ url_lower = link_url.lower()
376
+ has_submatch = False
377
+
378
+ # Search for a 5-character substring of the model name in both the URL and the link text
379
+ for i in range(len(model_name_lower) - 4):
380
+ if model_name_lower[i:i+5] in url_lower and model_name_lower[i:i+5] in link_text.lower():
381
+ has_submatch = True
382
+ break
383
+
384
+ if has_submatch:
385
+ # Calculate the confidence based on character matches
386
+ confidence = sum(1 for c in model_name_lower if c in link_text.lower())
387
+ model_links.append({
388
+ 'url': link_url,
389
+ 'text': link_text,
390
+ 'confidence': confidence
391
+ })
392
+ except Exception as e:
393
+ continue # Ignore errors and continue
394
+
395
+ # 3. Format the result
396
+ if not model_links:
397
+ return f"No potential links to the model '{model_name}' were found."
398
+
399
+ result = f"Found {len(model_links)} potential links for the model '{model_name}':\n\n"
400
+
401
+ for i, link in enumerate(model_links):
402
+ result += f"Candidate {i+1}:\n"
403
+ result += f"URL: {link['url']}\n"
404
+ result += f"Text: {link['text']}\n"
405
+ result += f"Confidence: {link['confidence']}\n\n"
406
+
407
+ # 4. Suggest the best candidate (the one with the highest confidence)
408
+ if model_links:
409
+ best_candidate = max(model_links, key=lambda x: x['confidence'])
410
+ result += f"Best candidate for '{model_name}':\nURL: {best_candidate['url']}\nText: {best_candidate['text']} "
411
+
412
+ return result
413
+ except Exception as e:
414
+ return f"Error while searching for links for the model '{model_name}': {str(e)}"
415
+
416
+ @tool
417
+ def click_at_coordinates(x: int, y: int) -> str:
418
+ """
419
+ Clicks at the specified x,y coordinates on the page.
420
+ This is useful when other targeting methods fail or when dealing with complex UI elements.
421
+
422
+ Args:
423
+ x: The x-coordinate to click at
424
+ y: The y-coordinate to click at
425
+
426
+ Returns:
427
+ A message confirming the click action
428
+ """
429
+ from src.agents.browser import driver
430
+
431
+ try:
432
+ # Using ActionChains for coordinate clicks (a fresh W3C pointer starts at the viewport origin, so the offset is effectively absolute)
433
+ actions = ActionChains(driver)
434
+ actions.move_by_offset(x, y).click().perform()
435
+ actions.reset_actions() # Reset position after click
436
+
437
+ # Alternative approach using Helium
438
+ # helium.click(helium.Point(x, y))
439
+
440
+ time.sleep(1) # Wait a moment for any reactions to the click
441
+ return f"Successfully clicked at coordinates ({x}, {y})"
442
+ except Exception as e:
443
+ return f"Failed to click at coordinates ({x}, {y}): {str(e)}"
src/file_utils.py ADDED
@@ -0,0 +1,316 @@
1
+ """
2
+ Utilities for file management.
3
+ """
4
+ import json
5
+ import os
6
+ import datetime
7
+ import shutil
8
+ import time
9
+ import random
10
+ import tempfile
11
+ import logging
12
+ from filelock import FileLock
13
+
14
+ logger = logging.getLogger("leaderboard-parser")
15
+
16
+ def save_results(results, file_path):
17
+ """
18
+ Save results to a JSON file.
19
+
20
+ Args:
21
+ results: The results to save
22
+ file_path: The path to the file
23
+ """
24
+ with open(file_path, "w") as f:
25
+ json.dump(results, f, indent=2)
26
+
27
+
28
+ def create_category_slug(category_name):
29
+ """
30
+ Creates a slug from a category name.
31
+ The slug uses only hyphens as separators (no underscore).
32
+
33
+ Args:
34
+ category_name: The category name
35
+
36
+ Returns:
37
+ The category slug
38
+ """
39
+ if not category_name:
40
+ return ""
41
+ # Convert to lowercase and replace spaces with hyphens
42
+ # Ensure no underscores are used in the category slug
43
+ return category_name.lower().replace(" ", "-").replace("_", "-")
44
+
45
+
46
+ def create_combined_id(category, uid):
47
+ """
48
+ Creates a normalized combined identifier from a category and UID.
49
+ First normalizes the category using create_category_slug.
50
+
51
+ Args:
52
+ category: The category name
53
+ uid: The UID of the leaderboard
54
+
55
+ Returns:
56
+ The combined identifier in the format category_slug_uid
57
+ """
58
+ normalized_category = create_category_slug(category)
59
+ return f"{normalized_category}_{uid}"
60
+
61
+
62
+ def validate_leaderboard_result(result):
63
+ """
64
+ Validates and corrects if necessary a leaderboard result to ensure identifier consistency.
65
+
66
+ This function checks:
67
+ 1. That 'uid' is present and correctly formatted (category_original_uid)
68
+ 2. That 'original_uid' is present
69
+ 3. That 'category' is present and normalized
70
+ 4. That 'uid' corresponds to the combination of category and original_uid
71
+
72
+ Args:
73
+ result: The leaderboard result to validate (dict)
74
+
75
+ Returns:
76
+ The validated and corrected result, or None if validation is impossible
77
+ """
78
+ if not isinstance(result, dict):
79
+ logger.error(f"Validation error: the result is not a dictionary")
80
+ return None
81
+
82
+ # Check if required fields are present
83
+ if "original_uid" not in result:
84
+ logger.error(f"Validation error: original_uid missing from result")
85
+ return None
86
+
87
+ if "category" not in result:
88
+ logger.error(f"Validation error: category missing from result")
89
+ return None
90
+
91
+ original_uid = result["original_uid"]
92
+ category = result["category"]
93
+
94
+ # Normalize the category if necessary
95
+ normalized_category = create_category_slug(category)
96
+ if normalized_category != category:
97
+ logger.warning(f"Category not normalized: '{category}' -> '{normalized_category}'")
98
+ result["category"] = normalized_category
99
+
100
+ # Recalculate the correct combined uid
101
+ correct_uid = create_combined_id(normalized_category, original_uid)
102
+
103
+ # Check if existing uid is correct
104
+ if "uid" not in result:
105
+ logger.warning(f"uid missing, adding calculated uid: {correct_uid}")
106
+ result["uid"] = correct_uid
107
+ elif result["uid"] != correct_uid:
108
+ logger.warning(f"uid inconsistent: '{result['uid']}' does not match '{correct_uid}', correction applied")
109
+ result["uid"] = correct_uid
110
+
111
+ return result
112
+
113
+
114
+ def load_and_validate_results(file_path):
115
+ """
116
+ Loads results from the file without strict validation.
117
+
118
+ Args:
119
+ file_path: Path to the results file
120
+
121
+ Returns:
122
+ List of results, or empty list in case of error
123
+ """
124
+ try:
125
+ # Load results from the file
126
+ try:
127
+ with open(file_path, "r", encoding="utf-8") as f:
128
+ results_data = json.load(f)
129
+ except (FileNotFoundError, json.JSONDecodeError) as e:
130
+ logger.warning(f"Unable to load file {file_path}: {str(e)}")
131
+ return []
132
+
133
+ # Convert from dict with "leaderboards" to array if necessary
134
+ if isinstance(results_data, dict) and "leaderboards" in results_data:
135
+ array_results = []
136
+ for uid, item in results_data["leaderboards"].items():
137
+ item_copy = item.copy()
138
+ item_copy["uid"] = uid
139
+ array_results.append(item_copy)
140
+ results_data = array_results
141
+
142
+ # Ensure results_data is a list
143
+ if not isinstance(results_data, list):
144
+ logger.warning(f"Invalid data format in {file_path}, initializing empty list")
145
+ return []
146
+
147
+ # Sort results
148
+ results_data.sort(key=lambda x: (x.get("category", ""), x.get("original_uid", "")))
149
+
150
+ logger.info(f"Load successful: {len(results_data)} results")
151
+ return results_data
152
+
153
+ except Exception as e:
154
+ logger.error(f"Error loading results: {str(e)}")
155
+ return []
156
+
157
+
158
+ def update_leaderboard_result(leaderboard_result, file_path, max_wait_seconds=30):
159
+ """
160
+ Updates a leaderboard result in the specified file.
161
+ If an entry with the same uid already exists, it is updated.
162
+ Otherwise, a new entry is added.
163
+
164
+ Args:
165
+ leaderboard_result: The leaderboard result to update (must contain a uid)
166
+ file_path: Path to the results file
167
+ max_wait_seconds: Maximum wait time for file lock (in seconds)
168
+
169
+ Returns:
170
+ Updated results list or None in case of error
171
+ """
172
+ if not leaderboard_result or "uid" not in leaderboard_result:
173
+ logger.error("Unable to update: invalid or missing leaderboard result or uid")
174
+ return None
175
+
176
+ # Create parent directory if necessary
177
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
178
+
179
+ # Use a lock to avoid concurrent writes
180
+ lock_path = f"{file_path}.lock"
181
+ lock = FileLock(lock_path, timeout=max_wait_seconds)
182
+
183
+ try:
184
+ with lock:
185
+ # Load existing results
186
+ current_results = load_and_validate_results(file_path)
187
+
188
+ # Index by uid for easy update
189
+ results_by_uid = {r.get("uid", ""): r for r in current_results if "uid" in r}
190
+
191
+ # Update or add result
192
+ uid = leaderboard_result["uid"]
193
+ if uid in results_by_uid:
194
+ # Update existing result
195
+ results_by_uid[uid].update(leaderboard_result)
196
+ logger.info(f"Result updated for uid: {uid}")
197
+ else:
198
+ # Add new result
199
+ results_by_uid[uid] = leaderboard_result
200
+ logger.info(f"New result added for uid: {uid}")
201
+
202
+ # Convert to list for writing
203
+ updated_results = list(results_by_uid.values())
204
+
205
+ # Sort results
206
+ updated_results.sort(key=lambda x: (x.get("category", ""), x.get("original_uid", "")))
207
+
208
+ # Write to temporary file then rename for atomicity
209
+ fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(file_path))
210
+ try:
211
+ with os.fdopen(fd, 'w', encoding='utf-8') as f:
212
+ json.dump(updated_results, f, indent=2, ensure_ascii=False)
213
+
214
+ # Replace original file with temporary file
215
+ shutil.move(temp_path, file_path)
216
+ logger.info(f"File updated successfully: {file_path}")
217
+
218
+ return updated_results
219
+ except Exception as e:
220
+ # Clean up in case of error
221
+ if os.path.exists(temp_path):
222
+ os.unlink(temp_path)
223
+ raise e
224
+
225
+ except Exception as e:
226
+ logger.error(f"Error updating file {file_path}: {str(e)}")
227
+ return None
228
+
229
+
230
+ def split_combined_id(combined_id):
231
+ """
232
+ Splits a combined identifier (category_uid) into its components.
233
+ Uses only the first underscore "_" as separator.
234
+
235
+ Args:
236
+ combined_id: The combined identifier (category_uid)
237
+
238
+ Returns:
239
+ A tuple (category, uid) or (None, combined_id) if no underscore
240
+ """
241
+ if not combined_id:
242
+ return None, None
243
+
244
+ # Search for the first underscore to separate category and uid
245
+ parts = combined_id.split("_", 1)
246
+ if len(parts) == 2:
247
+ return parts[0], parts[1]
248
+ else:
249
+ # If no underscore, consider it as just a uid without category
250
+ return None, combined_id
251
+
252
+
253
+ def format_datetime(dt_str):
254
+ """
255
+ Format a datetime string to a human readable format.
256
+
257
+ Args:
258
+ dt_str: The datetime string to format
259
+
260
+ Returns:
261
+ A formatted datetime string
262
+ """
263
+ try:
264
+ # Check if input is already a datetime object
265
+ if isinstance(dt_str, datetime.datetime):
266
+ dt = dt_str
267
+ else:
268
+ # Convert ISO format to datetime object
269
+ # Handle different formats of ISO dates including fractional seconds and timezone
270
+ try:
271
+ dt = datetime.datetime.fromisoformat(dt_str)
272
+ except ValueError:
273
+ # Handle other common formats
274
+ formats = [
275
+ "%Y-%m-%dT%H:%M:%S.%f%z",
276
+ "%Y-%m-%dT%H:%M:%S.%f",
277
+ "%Y-%m-%dT%H:%M:%S%z",
278
+ "%Y-%m-%dT%H:%M:%S",
279
+ "%Y-%m-%d %H:%M:%S",
280
+ "%Y-%m-%d"
281
+ ]
282
+
283
+ for fmt in formats:
284
+ try:
285
+ dt = datetime.datetime.strptime(dt_str, fmt)
286
+ break
287
+ except ValueError:
288
+ continue
289
+ else:
290
+ # If no format matches
291
+ return dt_str
292
+
293
+ # Format the datetime object
294
+ return dt.strftime("%d/%m/%Y at %H:%M:%S")
295
+ except (ValueError, TypeError) as e:
296
+ print(f"Error formatting date {dt_str}: {e}")
297
+ return dt_str
298
+
299
+
300
+ def clean_output_files(results_file):
301
+ """
302
+ Clean the output files, but keep a backup of the original.
303
+
304
+ Args:
305
+ results_file: The results file to clean
306
+ """
307
+ # If results file exists, make a backup
308
+ if os.path.exists(results_file):
309
+ backup_file = f"{results_file}.backup"
310
+ shutil.copy2(results_file, backup_file)
311
+ print(f"Backup of {results_file} created in {backup_file}")
312
+
313
+ # Create an empty results file
314
+ with open(results_file, "w") as f:
315
+ json.dump([], f, indent=2)
316
+ print(f"File {results_file} cleaned")
src/hub_utils.py ADDED
@@ -0,0 +1,175 @@
1
+ """
2
+ Utilities for interacting with the Hugging Face Hub.
3
+ """
4
+ import os
5
+ from huggingface_hub import HfApi, login, hf_hub_download
6
+
7
+
8
+ def upload_to_hub(to_parse_file, results_file, repo_id=None):
9
+ """
10
+ Uploads files to the Hugging Face Hub.
11
+
12
+ Args:
13
+ to_parse_file: Path to the categories file
14
+ results_file: Path to the results file
15
+ repo_id: Hub repository ID
16
+
17
+ Returns:
18
+ True if upload succeeded, False otherwise
19
+ """
20
+ try:
21
+ # Use environment variable HUGGING_FACE_STORAGE_REPO if available
22
+ # Otherwise, use default value
23
+ if repo_id is None:
24
+ repo_id = os.getenv("HUGGING_FACE_STORAGE_REPO", "leaderboard-explorer/leaderboard_explorer")
25
+ if os.getenv("HUGGING_FACE_STORAGE_REPO"):
26
+ print(f"Using target dataset specified in HUGGING_FACE_STORAGE_REPO: {repo_id}")
27
+ else:
28
+ print(f"No target dataset specified, using default value: {repo_id}")
29
+
30
+ # Check if token is available
31
+ token = os.getenv("HUGGING_FACE_HUB_TOKEN")
32
+ if not token:
33
+ print("ERROR: Environment variable HUGGING_FACE_HUB_TOKEN is not defined.")
34
+ return False
35
+
36
+ # Connect to Hub
37
+ print("Connecting to Hugging Face Hub...")
38
+ login(token=token)
39
+ api = HfApi()
40
+
41
+ # Upload JSON files
42
+ print(f"\n--- UPLOADING CATEGORIES FILE ---")
43
+ print(f"Local file: {to_parse_file}")
44
+ print(f"Destination: {repo_id}/best_model_for_category_list.json")
45
+ print(f"Uploading...")
46
+
47
+ try:
48
+ api.upload_file(
49
+ path_or_fileobj=to_parse_file,
50
+ path_in_repo="best_model_for_category_list.json",
51
+ repo_id=repo_id,
52
+ repo_type="dataset",
53
+ commit_message="Update leaderboard categories"
54
+ )
55
+ print(f"Upload of {to_parse_file} successful!")
56
+ except Exception as e:
57
+ print(f"Note when uploading {to_parse_file}: {e}")
58
+ if "No files have been modified since last commit" in str(e):
59
+ print("→ The categories file is identical to the one already on the Hub. No changes needed.")
60
+ else:
61
+ print(f"→ ERROR: Upload failed for another reason.")
62
+ raise e
63
+
64
+ print(f"\n--- UPLOADING RESULTS FILE ---")
65
+ print(f"Local file: {results_file}")
66
+ print(f"Destination: {repo_id}/best_model_for_results.json")
67
+ print(f"Uploading...")
68
+
69
+ try:
70
+ api.upload_file(
71
+ path_or_fileobj=results_file,
72
+ path_in_repo="best_model_for_results.json",
73
+ repo_id=repo_id,
74
+ repo_type="dataset",
75
+ commit_message="Update leaderboard results"
76
+ )
77
+ print(f"Upload of {results_file} successful!")
78
+ except Exception as e:
79
+ print(f"Note when uploading {results_file}: {e}")
80
+ if "No files have been modified since last commit" in str(e):
81
+ print("→ The results file is identical to the one already on the Hub. No changes needed.")
82
+ else:
83
+ print(f"→ ERROR: Upload failed for another reason.")
84
+ raise e
85
+
86
+ print(f"\nUpload operation completed: files have been processed!")
87
+ return True
88
+ except Exception as e:
89
+ print(f"GENERAL ERROR during file upload to Hub: {e}")
90
+ return False
91
+
92
+
93
+ def download_from_hub(repo_id=None):
94
+ """
95
+ Downloads files from the Hugging Face Hub.
96
+
97
+ Args:
98
+ repo_id: Hub repository ID
99
+
100
+ Returns:
101
+ True if download succeeded, False otherwise
102
+ """
103
+ try:
104
+ # Use environment variable HUGGING_FACE_STORAGE_REPO if available
105
+ # Otherwise, use default value
106
+ if repo_id is None:
107
+ repo_id = os.getenv("HUGGING_FACE_STORAGE_REPO", "leaderboard-explorer/leaderboard_explorer")
108
+ if os.getenv("HUGGING_FACE_STORAGE_REPO"):
109
+ print(f"Using source dataset specified in HUGGING_FACE_STORAGE_REPO: {repo_id}")
110
+ else:
111
+ print(f"No source dataset specified, using default value: {repo_id}")
112
+
113
+ # Check if token is available
114
+ token = os.getenv("HUGGING_FACE_HUB_TOKEN")
115
+ if not token:
116
+ print("ERROR: Environment variable HUGGING_FACE_HUB_TOKEN is not defined.")
117
+ return False
118
+
119
+ # Connect to Hub
120
+ login(token=token)
121
+
122
+ # Create data directory if it doesn't exist
123
+ script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
124
+ data_dir = os.path.join(script_dir, "data")
125
+ os.makedirs(data_dir, exist_ok=True)
126
+
127
+ # List of required and optional files
128
+ required_files = [
129
+ "final_leaderboards.json",
130
+ "best_model_for_category_list.json"
131
+ ]
132
+
133
+ optional_files = [
134
+ "best_model_for_results.json"
135
+ ]
136
+
137
+ # Download required files first
138
+ for filename in required_files:
139
+ local_path = os.path.join(data_dir, filename)
140
+ try:
141
+ # Download file
142
+ print(f"Downloading {filename} from {repo_id}...")
143
+ hf_hub_download(
144
+ repo_id=repo_id,
145
+ filename=filename,
146
+ repo_type="dataset",
147
+ local_dir=data_dir,
148
+ local_dir_use_symlinks=False
149
+ )
150
+ print(f"File {filename} successfully downloaded to {local_path}")
151
+ except Exception as e:
152
+ print(f"ERROR: Unable to download required file {filename}: {e}")
153
+ return False
154
+
155
+ # Download optional files next
156
+ for filename in optional_files:
157
+ local_path = os.path.join(data_dir, filename)
158
+ try:
159
+ print(f"Downloading {filename} from {repo_id}...")
160
+ hf_hub_download(
161
+ repo_id=repo_id,
162
+ filename=filename,
163
+ repo_type="dataset",
164
+ local_dir=data_dir,
165
+ local_dir_use_symlinks=False
166
+ )
167
+ print(f"File {filename} successfully downloaded to {local_path}")
168
+ except Exception as e:
169
+ print(f"WARNING: Unable to download optional file {filename}: {e}")
170
+ print(f"This is not a problem, a new file will be created if necessary.")
171
+
172
+ return True
173
+ except Exception as e:
174
+ print(f"ERROR during file download from Hub: {e}")
175
+ return False
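A sketch of a typical sync cycle with these helpers, assuming HUGGING_FACE_HUB_TOKEN (and optionally HUGGING_FACE_STORAGE_REPO) are set in the environment:
```py
# Download inputs, process locally, then push results back to the Hub.
from src.hub_utils import download_from_hub, upload_to_hub

if download_from_hub():
    # ... process leaderboards and update data/best_model_for_results.json ...
    upload_to_hub(
        to_parse_file="data/best_model_for_category_list.json",
        results_file="data/best_model_for_results.json",
    )
```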
src/leaderboard_processor.py ADDED
@@ -0,0 +1,158 @@
1
+ """
2
+ Leaderboard processing.
3
+ """
4
+ import datetime
5
+ import os
6
+ from src.agents.parser.parser_agent import process_leaderboard
7
+ from src.file_utils import create_category_slug, split_combined_id
8
+
9
+
10
+ def normalize_category(category_name):
11
+ """
12
+ Normalizes a category name by replacing spaces and underscores with hyphens and converting to lowercase.
13
+
14
+ Args:
15
+ category_name: The category name to normalize
16
+
17
+ Returns:
18
+ The normalized category
19
+ """
20
+ # Use the create_category_slug function from file_utils.py
21
+ return create_category_slug(category_name)
22
+
23
+
24
+ def process_single_leaderboard(uid, host, model, index, all_results, additional_rules=None, category=None):
25
+ """
26
+ Process a single leaderboard and update the results.
27
+
28
+ Args:
29
+ uid: The UID of the leaderboard to process
30
+ host: The URL of the leaderboard
31
+ model: The model to use
32
+ index: The index of the leaderboard
33
+ all_results: The list of all results
34
+ additional_rules: Additional specific rules for this leaderboard
35
+ category: The category of the leaderboard (for combined identifier)
36
+
37
+ Returns:
38
+ The updated list of results
39
+ """
40
+ print(f"\n\nProcessing leaderboard: {uid} - {host}")
41
+ if additional_rules:
42
+ print(f"Additional rules for this leaderboard: {additional_rules}")
43
+ if category:
44
+ normalized_category = normalize_category(category)
45
+ print(f"Category: {category} (normalized: {normalized_category})")
46
+ else:
47
+ normalized_category = None
48
+
49
+ # Get the maximum number of retries from environment variables
50
+ max_retries = int(os.getenv("LEADERBOARD_MAX_RETRIES", "3"))
51
+ print(f"Maximum number of retries configured: {max_retries}")
52
+
53
+ attempt = 0
54
+ last_error = None
55
+
56
+ # Try to process the leaderboard multiple times
57
+ while attempt < max_retries:
58
+ attempt += 1
59
+ if attempt > 1:
60
+ print(f"Retry attempt {attempt}/{max_retries} for leaderboard {uid} - {host}")
61
+
62
+ # Process the leaderboard
63
+ result = process_leaderboard(host, model, index, uid, additional_rules)
64
+
65
+ # If the parsing was successful or we've reached the maximum number of retries
66
+ if result.get("parsing_status") == "success" or attempt >= max_retries:
67
+ break
68
+
69
+ # If there was an error, save it for later
70
+ if result.get("parsing_status") == "error":
71
+ last_error = result.get("parsing_message", "Unknown error")
72
+ print(f"Error during attempt {attempt}: {last_error}")
73
+
74
+ # Get parsing date from result or generate a new one if not available
75
+ if result and "parsed_at" in result:
76
+ parsed_at = result["parsed_at"]
77
+ else:
78
+ # Fallback to current time if not provided by process_leaderboard
79
+ now = datetime.datetime.now()
80
+ parsed_at = now.isoformat()
81
+
82
+ # Create combined ID if category is provided
83
+ result_uid = uid
84
+ if normalized_category:
85
+ # Format of the combined UID: category_uid
86
+ # The category is already normalized (slugified) by normalize_category
87
+ # The underscore "_" is the ONLY separator between the category and the UID
88
+ result_uid = f"{normalized_category}_{uid}"
89
+
90
+ # Create base result object with uid, host, and parsing metadata
91
+ leaderboard_result = {
92
+ "uid": result_uid,
93
+ "original_uid": uid,
94
+ "category": normalized_category,
95
+ "host": host,
96
+ "parsing_status": "rejected", # Default to rejected
97
+ "parsed_at": parsed_at
98
+ }
99
+
100
+ # Check if we have valid results
101
+ valid_result = False
102
+ if result and result.get("results"):
103
+ if isinstance(result["results"], dict):
104
+ # Check if we have top models with required fields
105
+ if "top_models" in result["results"] and len(result["results"]["top_models"]) > 0:
106
+ valid_models = True
107
+ for model_info in result["results"]["top_models"]:
108
+ # Each model must have at least rank and name
109
+ if not model_info.get("rank") or not model_info.get("name"):
110
+ valid_models = False
111
+ break
112
+
113
+ # Check if we have evaluation criteria
114
+ if valid_models and "evaluation_criteria" in result["results"] and result["results"]["evaluation_criteria"]:
115
+ valid_result = True
116
+ else:
117
+ print(f"Invalid results format: {type(result['results']).__name__}, expected dict")
118
+ else:
119
+ print(f"Missing or empty results in agent response")
120
+
121
+ # If we have valid results, extract the data
122
+ if valid_result:
123
+ leaderboard_result["parsing_status"] = "approved"
124
+ leaderboard_result["top_models"] = []
125
+ leaderboard_result["evaluation_criteria"] = result["results"]["evaluation_criteria"]
126
+
127
+ # Extract top models
128
+ for model_info in result["results"]["top_models"]:
129
+ model_entry = {
130
+ "rank": model_info.get("rank"),
131
+ "name": model_info.get("name"),
132
+ "url": model_info.get("url", None)
133
+ }
134
+ leaderboard_result["top_models"].append(model_entry)
135
+ else:
136
+ print(f"Leaderboard rejected: {uid} - Incomplete or invalid information")
137
+
138
+ # Check if this UID already exists in the results
139
+ for i, existing_result in enumerate(all_results):
140
+ if existing_result["uid"] == result_uid:
141
+ # Replace the existing result
142
+ all_results[i] = leaderboard_result
143
+ print(f"Result updated for UID: {result_uid}")
144
+ return all_results
145
+
146
+ # ADDITIONAL CHECK: Make sure there's no confusion with other categories
147
+ # for the same original_uid
148
+ for existing_result in all_results:
149
+ if existing_result["original_uid"] == uid and existing_result["category"] != normalized_category:
150
+ print(f"WARNING: A result already exists for original_uid {uid} but with a different category:")
151
+ print(f" - Existing category: {existing_result['category']}, UID: {existing_result['uid']}")
152
+ print(f" - New category: {normalized_category}, UID: {result_uid}")
153
+ # We continue anyway, as it's a valid case to have the same leaderboard in different categories
154
+
155
+ # If we get here, this is a new result
156
+ all_results.append(leaderboard_result)
157
+ print(f"New result added for UID: {result_uid}")
158
+ return all_results
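An illustrative call to `process_single_leaderboard`; UID, host, and category are placeholders, and a browser environment plus a model from `get_default_model()` are required:
```py
# Hypothetical single-leaderboard run (placeholder UID/host/category).
from src.agents.parser.parser_agent import get_default_model
from src.leaderboard_processor import process_single_leaderboard

all_results = process_single_leaderboard(
    uid="abc123",
    host="https://example-leaderboard.org",
    model=get_default_model(),
    index=0,
    all_results=[],
    category="Text Generation",
)
# The entry is stored under the combined uid "text-generation_abc123".
```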
src/processor.py ADDED
@@ -0,0 +1,414 @@
1
+ """
2
+ Leaderboard processing module for the leaderboard parser.
3
+ This module contains the main functions for processing leaderboards.
4
+ """
5
+ import json
6
+ import os
7
+ import datetime
8
+ import logging
9
+ import time
10
+ import argparse
11
+ from typing import Dict, Any, List, Tuple, Optional
12
+
13
+ # Import functions from other modules
14
+ from src.file_utils import save_results, format_datetime, clean_output_files, update_leaderboard_result
15
+ from src.file_utils import create_category_slug, split_combined_id, create_combined_id
16
+ from src.file_utils import load_and_validate_results, validate_leaderboard_result
17
+ from src.hub_utils import upload_to_hub, download_from_hub
18
+ from src.leaderboard_processor import process_single_leaderboard
19
+ from src.agents.parser.parser_agent import get_default_model
20
+ from src.agents.browser import cleanup_browser
21
+
22
+ # Configure logger
23
+ logger = logging.getLogger("leaderboard-parser")
24
+
25
+ # Update state variables in server module
26
+ def update_server_status(status, error=None):
27
+ """
28
+ Updates the server status.
29
+
30
+ Args:
31
+ status: The new status ('idle', 'running', 'completed', 'failed')
32
+ error: The error message in case of failure
33
+ """
34
+ try:
35
+ from src.server import processing_status, processing_error
36
+
37
+ # Mirror the status in this module's globals (the authoritative server state is set below)
38
+ globals()['processing_status'] = status
39
+ globals()['processing_error'] = error
40
+
41
+ # Update server module variables
42
+ import src.server
43
+ src.server.processing_status = status
44
+ src.server.processing_error = error
45
+ except ImportError:
46
+ # In non-server mode, these variables don't exist
47
+ pass
48
+
+ def process_leaderboards(args_dict=None) -> Tuple[bool, str]:
+     """
+     Process leaderboards with the given arguments.
+     Returns a tuple of (success, message)
+     """
+     # Update status
+     update_server_status("running")
+
+     # Set default arguments if none provided
+     if args_dict is None:
+         args_dict = {"local_only": False}
+
+     # Create an argparse.Namespace object from the dictionary
+     args = argparse.Namespace(**args_dict)
+
+     try:
+         # Ensure we're in the correct directory
+         script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+         os.chdir(script_dir)
+
+         # Verify that the HF token is set
+         if not os.environ.get("HUGGING_FACE_HUB_TOKEN") and not args.local_only:
+             raise ValueError("HUGGING_FACE_HUB_TOKEN environment variable is not set!")
+
+         # Use default paths for the category list and leaderboards
+         category_list_file = "data/best_model_for_category_list.json"
+         leaderboards_file = "data/final_leaderboards.json"
+         results_file = "data/best_model_for_results.json"
+
+         # Clean if requested
+         if getattr(args, "clean", False):
+             clean_output_files(results_file)
+
+         # Check if we're just uploading
+         if getattr(args, "upload_only", False):
+             upload_to_hub(to_parse_file=category_list_file, results_file=results_file)
+             update_server_status("completed")
+             return True, "Upload completed successfully"
+
+         # Download data from the Hub if not in local-only mode
+         if not getattr(args, "local_only", False):
+             download_from_hub()
+
+         logger.info("Starting leaderboard processing")
+
+         # Load the category list and leaderboards data
+         try:
+             with open(category_list_file, "r", encoding="utf-8") as f:
+                 category_list = json.load(f)
+
+             with open(leaderboards_file, "r", encoding="utf-8") as f:
+                 leaderboards = json.load(f)
+
+             # Create a mapping UID -> HOST for all leaderboards
+             uid_to_host = {lb["uid"]: lb["host"] for lb in leaderboards if "uid" in lb and "host" in lb}
+             logger.info(f"Loaded {len(uid_to_host)} UID -> HOST mappings from {leaderboards_file}")
+         except FileNotFoundError as e:
+             update_server_status("failed", str(e))
+             return False, f"File not found: {e}"
+
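+         # Illustrative input shapes (editor's note, inferred from the lookups
+         # above and the loop below; the real files may carry extra fields):
+         #   final_leaderboards.json: [{"uid": "...", "host": "https://..."}, ...]
+         #   best_model_for_category_list.json:
+         #     [{"category": "...", "leaderboards": [
+         #         {"uid": "...", "additionnal_agent_rules": "..."}]}, ...]
+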
110
+ # Load existing results if any
111
+ try:
112
+ logger.info(f"Loading and validating results from {results_file}")
113
+ results_data = load_and_validate_results(results_file)
114
+ all_results = results_data
115
+ logger.info(f"Loaded and validated {len(all_results)} existing results")
116
+ except Exception as e:
117
+ logger.warning(f"Error loading results: {str(e)}")
118
+ results_data = []
119
+ all_results = []
120
+
121
+ # Create a map of combined UIDs to their complete data (for checking parsing date)
122
+ processed_results_map = {}
123
+ for result in results_data:
124
+ if "uid" in result:
125
+ processed_results_map[result["uid"]] = result
126
+
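+         # Illustrative shape of one stored result (editor's note, inferred
+         # from the fields read below; real entries may hold more keys):
+         #   {"uid": "<category-slug>_<original-uid>", "original_uid": "...",
+         #    "category": "...", "parsed_at": "2024-01-01T00:00:00",
+         #    "parsing_status": "rejected" or another status}
+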
+         # Get the reprocessing interval from the environment variable (in hours),
+         # defaulting to 24 hours
+         reprocess_interval_hours = int(os.getenv("LEADERBOARD_REPROCESS_INTERVAL_HOURS", "24"))
+
+         # Maximum age without update (in seconds)
+         max_age_seconds = reprocess_interval_hours * 60 * 60
+         logger.info(f"Leaderboard reprocessing interval: {reprocess_interval_hours} hours")
+
+         # Current date and time
+         now = datetime.datetime.now()
+         print(f"Current system date: {now.isoformat()} - Readable format: {format_datetime(now.isoformat())}")
+
+         # Get the default model
+         model = get_default_model()
+
+         # Collect all leaderboards to process
+         leaderboards_to_process = []
+         force_retry_leaderboards = []
+
+         # Debugging logs
+         logger.info(f"Available categories: {len(category_list)}")
+         logger.info(f"Available leaderboards: {len(uid_to_host)}")
+         logger.info(f"Sample of available UIDs: {list(uid_to_host.keys())[:5]}")
+
+         # Check if a specific category is requested
+         target_category = getattr(args, "force_retry_category", None)
+         target_uid = getattr(args, "force_retry_uid", None)
+
+         # Exclusive mode (only process the specified leaderboards)
+         exclusive_mode = target_category is not None or target_uid is not None
+
+         if target_category:
+             logger.info(f"Force retry category mode enabled (exclusive): {target_category}")
+
+         if target_uid:
+             logger.info(f"Force retry UID mode enabled (exclusive): {target_uid}")
+
+         # Process leaderboards
+         for category in category_list:
+             category_name = category["category"]
+             normalized_category = create_category_slug(category_name)
+
+             # If a specific category is targeted and this is not it, skip to the next one
+             if target_category and target_category != normalized_category:
+                 logger.info(f"Category {category_name} (normalized: {normalized_category}) ignored - Does not match target category {target_category}")
+                 continue
+
+             # ADDITIONAL SAFETY: Reload data from the file before each new category;
+             # this ensures there is no contamination between categories
+             try:
+                 logger.info(f"Reloading data from file before processing category: {category_name}")
+                 all_results = load_and_validate_results(results_file)
+                 logger.info(f"Data reloaded successfully: {len(all_results)} results available")
+             except Exception as e:
+                 logger.warning(f"Unable to reload data before category {category_name}: {str(e)}")
+                 # In case of error, keep the existing data if possible
+                 if not isinstance(all_results, list):
+                     all_results = []
+
+             # Check if the category has leaderboards
+             if "leaderboards" not in category or not isinstance(category["leaderboards"], list):
+                 logger.warning(f"Category '{category_name}' has no leaderboards or an incorrect format.")
+                 continue
+
+             # Process each leaderboard in the category
+             for leaderboard in category["leaderboards"]:
+                 if "uid" not in leaderboard:
+                     logger.warning(f"Leaderboard in category '{category_name}' has no UID.")
+                     continue
+
+                 leaderboard_uid = leaderboard["uid"]
+
+                 # In specific-UID mode, ignore all other leaderboards
+                 if target_uid and target_uid != leaderboard_uid:
+                     logger.info(f"Leaderboard {leaderboard_uid} ignored - Does not match target UID {target_uid}")
+                     continue
+
+                 # Get the additional rules if available
+                 additional_rules = leaderboard.get("additionnal_agent_rules", None)
+
+                 # Check if we should force processing this leaderboard,
+                 # using the two distinct options
+                 force_retry_uid = getattr(args, "force_retry_uid", None) == leaderboard_uid
+                 force_retry_category = getattr(args, "force_retry_category", None) == normalized_category
+
+                 # Support the old option for backward compatibility (to be removed later)
+                 legacy_force_retry = False
+                 if hasattr(args, "force_retry") and getattr(args, "force_retry", None) is not None:
+                     legacy_force_retry = (
+                         getattr(args, "force_retry", None) == leaderboard_uid or
+                         getattr(args, "force_retry", None) == normalized_category
+                     )
+                     if legacy_force_retry:
+                         logger.warning("The --force-retry option is deprecated. Use --force-retry-uid or --force-retry-category instead.")
+
+                 # Combine the different sources of force_retry
+                 force_retry = force_retry_uid or force_retry_category or legacy_force_retry
+
+                 # Log the reason for the forced retry explicitly
+                 if force_retry:
+                     if force_retry_uid:
+                         logger.info(f"Force retry enabled for leaderboard UID: {leaderboard_uid}")
+                     elif force_retry_category:
+                         logger.info(f"Force retry enabled for all leaderboards in category: {normalized_category}")
+                     elif legacy_force_retry:
+                         logger.info(f"Force retry enabled via the legacy --force-retry option for: {getattr(args, 'force_retry', None)}")
+
+                 # Look up the leaderboard URL in uid_to_host (direct dictionary lookup)
+                 host = uid_to_host.get(leaderboard_uid)
+
+                 if not host:
+                     logger.warning(f"UID '{leaderboard_uid}' (category: {normalized_category}) not found in leaderboards.")
+                     # Show more information for debugging
+                     logger.debug(f"Total number of UIDs available: {len(uid_to_host)}")
+                     continue
+
+                 # Create the combined identifier (category_uid);
+                 # the category is already normalized by create_category_slug
+                 combined_uid = create_combined_id(normalized_category, leaderboard_uid)
+
+                 # If force_retry is enabled, process the leaderboard without checking the time since the last run
+                 if force_retry:
+                     logger.info(f"Force retry enabled for {combined_uid} - Processing forced regardless of the last processing date.")
+                     leaderboards_to_process.append({
+                         "uid": leaderboard_uid,
+                         "host": host,
+                         "category": normalized_category,
+                         "additional_rules": additional_rules,
+                         "force_retry": force_retry
+                     })
+                     continue  # Skip directly to the next leaderboard
+
+                 # Check if the leaderboard has already been processed recently
+                 needs_reprocessing = True
+                 if combined_uid in processed_results_map:
+                     # Check whether the leaderboard was processed within the interval
+                     result = processed_results_map[combined_uid]
+
+                     # If the --retry-rejected option is active and the status is "rejected", force reprocessing
+                     if getattr(args, "retry_rejected", False) and result.get("parsing_status") == "rejected":
+                         logger.info(f"Leaderboard {combined_uid} previously rejected, forced reprocessing with --retry-rejected.")
+                     elif "parsed_at" in result:
+                         try:
+                             # Convert the parsing date to a datetime object
+                             parsed_at = datetime.datetime.fromisoformat(result["parsed_at"])
+
+                             # Compute the time elapsed since the last parsing
+                             time_diff = now - parsed_at
+
+                             # Debug logs for the date checks
+                             logger.info(f"DEBUG: Current date: {now.isoformat()}")
+                             logger.info(f"DEBUG: Last parsing date: {parsed_at.isoformat()}")
+                             logger.info(f"DEBUG: Time difference in seconds: {time_diff.total_seconds()}")
+                             logger.info(f"DEBUG: Reprocessing threshold (seconds): {max_age_seconds}")
+
+                             # Strictly check whether the elapsed time exceeds the threshold
+                             time_seconds = time_diff.total_seconds()
+
+                             # If more than max_age_seconds have elapsed, reparse
+                             if time_seconds > max_age_seconds:
+                                 needs_reprocessing = True
+                                 print(f"\n\nLeaderboard {combined_uid} - {host} parsed more than {reprocess_interval_hours} hours ago ({format_datetime(result['parsed_at'])}), reprocessing necessary.")
+                             else:
+                                 print(f"\n\nLeaderboard {combined_uid} - {host} already processed recently ({format_datetime(result['parsed_at'])}), moving to next. Age: {time_seconds} seconds (threshold: {max_age_seconds})")
+                                 continue
+                         except (ValueError, TypeError):
+                             # If the date is invalid, reprocess as a precaution
+                             logger.info(f"Leaderboard {combined_uid} has an invalid processing date, reprocessing necessary.")
+                     else:
+                         # If the parsing date is missing, reprocess as a precaution
+                         logger.info(f"Leaderboard {combined_uid} has no processing date, reprocessing necessary.")
+                 else:
+                     # If the leaderboard has never been processed, process it
+                     logger.info(f"New leaderboard {combined_uid} to process.")
+
+                 if needs_reprocessing or force_retry:
+                     leaderboards_to_process.append({
+                         "uid": leaderboard_uid,
+                         "host": host,
+                         "category": normalized_category,
+                         "additional_rules": additional_rules,
+                         "force_retry": force_retry
+                     })
+
+         # Information on the number of leaderboards to process
+         logger.info(f"Total number of leaderboards to process: {len(leaderboards_to_process)}")
+
+         # Process each leaderboard
+         for index, leaderboard_info in enumerate(leaderboards_to_process):
+             leaderboard_uid = leaderboard_info["uid"]
+             host = leaderboard_info["host"]
+             category_name = leaderboard_info["category"]
+             additional_rules = leaderboard_info["additional_rules"]
+             force_retry = leaderboard_info["force_retry"]
+
+             # Process this leaderboard
+             logger.info(f"Processing leaderboard {index+1}/{len(leaderboards_to_process)}: {leaderboard_uid} (category: {category_name})")
+
+             try:
+                 # Restart the browser every 2 leaderboards to avoid memory leaks
+                 if index > 0 and index % 2 == 0:
+                     logger.info(f"Periodic browser cleanup after {index} leaderboards to avoid memory leaks")
+                     cleanup_browser()
+                     # Force garbage collection
+                     import gc
+                     gc.collect()
+                     # Small pause to let the system clean up
+                     time.sleep(3)
+
+                 # Process the leaderboard
+                 all_results = process_single_leaderboard(
+                     leaderboard_uid,
+                     host,
+                     model,
+                     index,
+                     all_results,
+                     additional_rules,
+                     category_name
+                 )
+
+                 # Detailed logs for diagnosing problems
+                 logger.info(f"Results after processing: {len(all_results)} elements")
+                 # Look for results corresponding to the processed leaderboard
+                 for idx, res in enumerate(all_results):
+                     if res.get("original_uid") == leaderboard_uid:
+                         logger.info(f"Found result {idx}: uid={res.get('uid')}, original_uid={res.get('original_uid')}, category={res.get('category')}")
+
+                 # Clean up after each processing run
+                 cleanup_browser()
+
+                 # Verify that the leaderboard exists with the exact normalized category:
+                 # strict search by original_uid AND category
+                 normalized_category_name = create_category_slug(category_name)
+                 current_result = None
+                 for result in all_results:
+                     # Always compare normalized categories to avoid format issues
+                     result_category = result.get("category", "")
+                     if result.get("original_uid") == leaderboard_uid and create_category_slug(result_category) == normalized_category_name:
+                         current_result = result
+                         logger.info(f"Found result for {leaderboard_uid}, category: {result.get('category')}")
+                         break
+
+                 # REMOVED: no longer fall back to a search by original_uid alone;
+                 # if the result is not found, it's probably an error or the processing failed
+                 if not current_result:
+                     logger.error(f"RESULT NOT FOUND for {leaderboard_uid}, normalized_category: {normalized_category_name}")
+                     logger.error("Search for all results corresponding to this UID:")
+                     for res in all_results:
+                         if res.get("original_uid") == leaderboard_uid:
+                             logger.error(f" - Result with category={res.get('category')}, uid={res.get('uid')}")
+                     logger.error(f"Leaderboard {leaderboard_uid} (category: {category_name}) not updated because the result was not found")
+                     continue
+
+                 # Update only this specific leaderboard in the results file
+                 logger.info(f"Updating leaderboard {leaderboard_uid} (category: {category_name}) in file")
+                 updated_results = update_leaderboard_result(current_result, results_file)
+
+                 # CRITICAL FIX: Update the all_results list with the file data
+                 # to avoid desynchronization between the file and the in-memory list
+                 all_results = updated_results
+
+                 # Update the global results for the next leaderboard
+                 results_data = updated_results
+
+                 logger.info(f"Leaderboard {leaderboard_uid} (category: {category_name}) saved")
+
+                 # Upload to the HF Hub after each leaderboard if not in local-only mode
+                 if not getattr(args, "local_only", False):
+                     logger.info(f"Uploading results to HF Hub after processing leaderboard {leaderboard_uid}")
+                     try:
+                         upload_to_hub(to_parse_file=category_list_file, results_file=results_file)
+                         logger.info(f"Upload successful to HF Hub for leaderboard {leaderboard_uid}")
+                     except Exception as upload_err:
+                         logger.warning(f"Upload to HF Hub failed after processing leaderboard {leaderboard_uid}: {str(upload_err)}")
+             except Exception as e:
+                 logger.error(f"Error processing leaderboard {leaderboard_uid} (category: {category_name}): {str(e)}")
+                 continue
+
+         # No final save needed: every leaderboard has already been updated individually
+         logger.info("Leaderboard processing completed")
+
+         update_server_status("completed")
+         return True, "Processing completed successfully"
+
+     except Exception as e:
+         update_server_status("failed", str(e))
+         logger.exception("Error processing leaderboards")
+         return False, f"Error processing leaderboards: {str(e)}"
src/scheduler.py ADDED
@@ -0,0 +1,99 @@
+ """
+ Scheduling module for the leaderboard parser.
+ This module contains scheduling functions for periodic execution of leaderboard processing.
+ """
+ import datetime
+ import threading
+ import time
+ import logging
+ import os
+
+ # Initialize logger
+ logger = logging.getLogger("leaderboard-parser")
+
+ # Global variables for the scheduler
+ stop_thread = False
+ last_run_time = None
+
+ # Reference to the processing function (set by initialize_scheduler)
+ process_leaderboards_function = None
+
+ def initialize_scheduler(process_function):
+     """
+     Initialize the scheduler with the leaderboard processing function.
+
+     Args:
+         process_function: Function that processes leaderboards
+     """
+     global process_leaderboards_function
+     process_leaderboards_function = process_function
+     logger.info("Scheduler initialized with processing function")
+
+ def scheduler_thread():
+     """Thread that checks when to run the leaderboard processing job"""
+     global last_run_time
+
+     if not process_leaderboards_function:
+         logger.error("Scheduler has not been initialized with a processing function")
+         return
+
+     logger.info("Scheduler thread started")
+
+     # Get the reprocess interval from the environment, defaulting to 24 hours
+     interval_hours = int(os.environ.get("LEADERBOARD_REPROCESS_INTERVAL_HOURS", 24))
+     interval_seconds = interval_hours * 3600
+
+     logger.info(f"Leaderboard reprocess interval set to {interval_hours} hours")
+
+     while not stop_thread:
+         now = datetime.datetime.now()
+
+         # If there was no previous run, or the interval has passed since the last one
+         if last_run_time is None or (now - last_run_time).total_seconds() >= interval_seconds:
+             logger.info(f"{interval_hours} hours have passed since the last run, executing the job")
+
+             # Get the current status (re-imported each iteration for a fresh read)
+             from src.server import processing_status
+
+             # If we're not already processing
+             if processing_status != "running":
+                 # Run the processing job
+                 last_run_time = now
+                 # Keep the server's own copy in sync so the /status endpoint
+                 # can report it (src.server holds a separate global)
+                 import src.server
+                 src.server.last_run_time = now
+                 success, message = process_leaderboards_function({"local_only": False})
+                 logger.info(f"Processing job completed with status: {success}, message: {message}")
+
+                 # Wait at least 80% of the interval before checking again;
+                 # this prevents back-to-back executions and provides a buffer
+                 time.sleep(interval_seconds * 0.8)
+             else:
+                 # A run is already in progress: sleep briefly instead of busy-looping
+                 time.sleep(60)
+         else:
+             # Calculate the time until the next run
+             seconds_until_next_run = interval_seconds - (now - last_run_time).total_seconds()
+             hours_until_next_run = seconds_until_next_run / 3600
+
+             # Log progress once per hour (the loop wakes every 60 seconds,
+             # so a 60-second window is needed to catch the top of each hour)
+             if int(seconds_until_next_run) % 3600 < 60:
+                 logger.info(f"Next scheduled run in {hours_until_next_run:.1f} hours")
+
+             # Sleep for a minute before checking again
+             time.sleep(60)
+
+ def start_scheduler():
+     """Start the scheduler thread"""
+     global stop_thread
+
+     # Reset the stop flag
+     stop_thread = False
+
+     # Start the scheduler thread
+     scheduler = threading.Thread(target=scheduler_thread)
+     scheduler.daemon = True
+     scheduler.start()
+
+     logger.info("Scheduler thread started")
+     return scheduler
+
+ def stop_scheduler():
+     """Stop the scheduler thread"""
+     global stop_thread
+     stop_thread = True
+     logger.info("Scheduler thread stop requested")
src/server.py ADDED
@@ -0,0 +1,75 @@
+ """
+ API server module for the leaderboard parser.
+ This module contains the FastAPI configuration and the endpoints for server mode.
+ """
+ import datetime
+ import threading
+ import logging
+ import os
+ from fastapi import FastAPI, HTTPException
+ from fastapi.responses import JSONResponse
+ from src.file_utils import format_datetime
+
+ # Initialize logger
+ logger = logging.getLogger("leaderboard-parser")
+
+ # Global variables tracking the server state
+ processing_status = "idle"
+ processing_error = None
+ last_run_time = None
+
+ # Initialize the FastAPI application
+ app = FastAPI(title="Leaderboard Parser API")
+
+ # This function will be injected from main.py
+ process_leaderboards = None
+
+ def initialize_server(process_function):
+     """
+     Initialize the server with the leaderboard processing function.
+     This function must be called before starting the server.
+
+     Args:
+         process_function: Function that processes leaderboards
+     """
+     global process_leaderboards
+     process_leaderboards = process_function
+     logger.info("Server initialized with processing function")
+
+ # API endpoints
+ @app.get("/")
+ async def root():
+     """Root endpoint returning basic info"""
+     return {
+         "name": "Leaderboard Parser API",
+         "status": "running",
+         "version": "1.0.0"
+     }
+
+ @app.get("/status")
+ async def get_status():
+     """Get the current status of the parser"""
+     interval_hours = int(os.environ.get("LEADERBOARD_REPROCESS_INTERVAL_HOURS", 24))
+     next_run = last_run_time + datetime.timedelta(hours=interval_hours) if last_run_time else None
+
+     return {
+         "status": processing_status,
+         # format_datetime expects an ISO string (see its call sites in the processor)
+         "last_run": format_datetime(last_run_time.isoformat()) if last_run_time else None,
+         "next_run": format_datetime(next_run.isoformat()) if next_run else None,
+         "error": processing_error
+     }
+
+ @app.post("/run")
+ async def trigger_run():
+     """Manually trigger a leaderboard processing run"""
+     if not process_leaderboards:
+         raise HTTPException(status_code=500, detail="Server not properly initialized")
+
+     if processing_status == "running":
+         raise HTTPException(status_code=409, detail="Processing is already running")
+
+     # Start processing in a separate thread (the callable is passed directly)
+     threading.Thread(target=process_leaderboards).start()
+
+     return {"status": "started", "message": "Processing started"}