Loren committed on
Commit
ff9afd7
·
verified ·
1 Parent(s): 9f1341c

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ thumbnail.jpg filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import re
4
+ import ast
5
+ import io
6
+ import os
7
+ from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
8
+ from pathlib import Path
9
+ import uuid
10
+ import warnings
11
+ warnings.filterwarnings("ignore")
12
+
13
+ ####################################################################
14
+ ### FUNCTIONS ###
15
+ ####################################################################
16
+
17
def initializations():
    """Seed ``st.session_state`` with the application defaults.

    Each key is only written when it is missing, so calling this on every
    script rerun keeps whatever the user has already changed (uploaded files,
    selected question, dataframe key).

    The function is deliberately NOT wrapped in ``st.cache_data``: that cache
    is shared by all sessions, so a cached call skips the body and leaves the
    session state of every session after the first one uninitialised.
    """
    defaults = {
        "question": "",
        "file_dataset": "./data/gaia_subset.csv",
        "file_evaluations": "./data/gaia_evals.csv",
        "gaia": True,
        "file_lib": "./data/lib.md",
        "file_sidebar": "./data/gaia_sidebar.txt",
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

    # Key of the selectable dataframe; generated once per session.
    if "dfk" not in st.session_state:
        st.session_state["dfk"] = str(uuid.uuid4())
26
+ #
27
+
28
@st.cache_data(show_spinner=True)
def get_dataset(dataset_file):
    """Read the test dataset from a 'µ'-separated CSV source.

    Args:
        dataset_file: anything ``pd.read_csv`` accepts as its first argument.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    read_options = {"sep": 'µ', "engine": 'python'}
    dataset = pd.read_csv(dataset_file, **read_options)
    return dataset
31
+ #
32
+
33
@st.cache_data(show_spinner=True)
def get_evaluations(eval_file):
    """Load the agents' evaluation results and build the tables shown by the app.

    The evaluation file is read as a 'µ'-separated CSV and left-joined on
    ``task_id`` with ``st.session_state.df_dataset`` to bring in the question,
    the file URL and the expected answer.

    NOTE(review): ``st.session_state.df_dataset`` is read inside this cached
    function but is not one of its arguments -- presumably a cached result is
    not refreshed when only the dataset changes; confirm.

    Args:
        eval_file: source handed to ``pd.read_csv``.

    Returns:
        A tuple ``(df_eval, df_synth, df_perf, list_labels)``:
        * ``df_eval``: one row per (task_id, question) holding, for every
          label, the ``eval``, ``submitted_answer`` and ``messages`` values.
        * ``df_synth``: question x label table of the ``eval`` values.
        * ``df_perf``: the "Nb correct" and "% correct" rows, per label.
        * ``list_labels``: the labels, in order of first appearance.
    """
    def set_eval(answer1, answer2):
        # Compare case-insensitively, ignoring one trailing period and the
        # space that may follow a comma.
        answer1 = re.sub(r'\.$', '', answer1.lower()).replace(', ', ',')
        answer2 = re.sub(r'\.$', '', answer2.lower()).replace(', ', ',')
        return answer1 == answer2

    df = pd.read_csv(eval_file, sep='µ', engine='python')
    df = df.merge(st.session_state.df_dataset[['task_id', 'question', 'file_url', 'answer']],
                  on='task_id', how='left')
    # Orders of first appearance, reused below to keep columns and rows stable.
    list_labels = pd.unique(df['label'])
    list_questions = pd.unique(df['question'])
    df['eval'] = df.apply(lambda r: set_eval(str(r['submitted_answer']),
                                             str(r['answer'])), axis=1)
    # Wide table: one column per (value, label) pair.
    df_pivot = df.pivot(index=['task_id','question'], columns='label',
                        values=['eval','submitted_answer','messages'])
    df_reset = df_pivot.reindex(columns=list_labels, level=1).reset_index()
    # An ordered categorical makes sort_values follow the original question order.
    df_reset['question'] = pd.Categorical(df_reset['question'],
                                          categories=list_questions, ordered=True)
    df_eval = df_reset.sort_values('question')

    # Synthesis table, with rows in the same question order as df_eval.
    df_synth = df.pivot(index='question', columns='label', values='eval') \
                 .reindex(columns=list_labels) \
                 .reindex(pd.unique(df_eval['question']))

    # Per-label sum of the eval column.
    totaux = df_synth.sum(axis=0)

    # Transpose the totals so that the labels become the column headers.
    df_perf = totaux.reset_index().T
    df_perf.columns = df_perf.iloc[0]
    df_perf = df_perf.iloc[1:]
    df_perf.loc["Nb correct"] = totaux
    df_perf.loc["% correct"] = totaux *100 / len(df_eval)
    # Drop the first remaining row so that only the two rows added above are kept.
    df_perf = df_perf.iloc[1:]

    return df_eval, df_synth, df_perf, list_labels
68
+ #
69
+
70
@st.cache_data(show_spinner=True)
def get_lib(lib_file):
    """Return the dataset description as text.

    Args:
        lib_file: either a path given as ``str`` or an object whose
            ``read()`` returns UTF-8 encoded bytes.

    Returns:
        The decoded text content.
    """
    if not isinstance(lib_file, str):
        raw_bytes = lib_file.read()
        return raw_bytes.decode("utf-8")
    return Path(lib_file).read_text(encoding="utf-8")
78
+ #
79
+
80
@st.cache_data(show_spinner=True)
def get_sidebar(sidebar_file):
    """Return the lines of the sidebar description.

    Args:
        sidebar_file: either a path given as ``str`` or an object whose
            ``read()`` returns UTF-8 encoded bytes.

    Returns:
        The list of lines, line endings included.
    """
    if not isinstance(sidebar_file, str):
        decoded_text = sidebar_file.read().decode("utf-8")
        return io.StringIO(decoded_text).readlines()

    with open(sidebar_file, "r", encoding="utf-8") as handle:
        lignes = handle.readlines()
    return lignes
90
+ #
91
+
92
def parse_messages_from_string(messages_str):
    """Rebuild LangChain message objects from the text dump of a message list.

    The input is expected to contain ``'messages': [...]`` where the list is
    made of ``HumanMessage(...)``, ``AIMessage(...)`` and ``ToolMessage(...)``
    constructor calls written as text.

    Args:
        messages_str: the text dump to parse.

    Returns:
        A tuple ``(messages, status)``. ``messages`` is the list of rebuilt
        message objects; when one entry cannot be rebuilt, two
        ``HumanMessage`` placeholders describing the error are appended
        instead. ``status`` is ``False`` as soon as anything could not be
        parsed (no message list found, invalid input, or a failing entry), so
        that the caller can fall back to displaying the raw text.
    """
    messages = []
    status = True
    known_types = ('HumanMessage', 'AIMessage', 'ToolMessage')
    try:
        messages_match = re.search(r"'messages': \[(.*)\]", messages_str, re.DOTALL)
        if messages_match is None:
            # No message list at all: report the failure instead of silently
            # returning an empty, seemingly valid, history.
            return messages, False
        messages_content = messages_match.group(1)
        message_splits = re.findall(r'(HumanMessage\(.*?\)|AIMessage\(.*?\)|ToolMessage\(.*?\))(?=, HumanMessage\(|, AIMessage\(|, ToolMessage\(|$)', messages_content, re.DOTALL)

        for msg_str in message_splits:
            # Identify the message type from the constructor name
            msg_type = next((name for name in known_types if msg_str.startswith(name)), None)
            if msg_type is None:
                continue  # Unknown type, skip it

            # Keep only the constructor arguments: drop 'TypeMessage(' and ')'
            args_str = msg_str[len(msg_type) + 1:-1]
            # Turn the key=value pairs into "key": value pairs so that the
            # arguments can be evaluated as a dict literal.
            # NOTE: the substitution is applied to the whole argument text, so
            # a `word=` sequence inside a message content is rewritten too.
            args_str = re.sub(r'(\w+)=', r'"\1":', args_str)
            try:
                args = ast.literal_eval('{' + args_str + '}')
                builders = {
                    'HumanMessage': HumanMessage,
                    'AIMessage': AIMessage,
                    'ToolMessage': ToolMessage,
                }
                messages.append(builders[msg_type](**args))
            except Exception as e:
                messages.append(HumanMessage(f"*** Error parsing message: {e}"))
                messages.append(HumanMessage("*** See the original list of messages below"))
                status = False
                print(f"Error parsing message: {e}")
    except Exception as e:
        status = False
        print(f"Erreur lors de l'analyse du messageparse_message_from_string: {e}")
    # Plain return (not inside a `finally`), so that KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    return messages, status
140
+ #
141
+
142
def get_details():
    """Selection callback of the synthesis dataframe.

    Reads the selected row from the widget state stored under
    ``st.session_state.dfk``, stores the matching question in
    ``st.session_state.question`` and, for every agent label, writes the
    submitted answer (with a correct / needs-review badge) and the message
    history into the corresponding entry of the module-level ``list_tabs``.
    Nothing is done when no row is selected.
    """
    dfkey = st.session_state.dfk
    widget_state = st.session_state[dfkey]
    if not (len(widget_state) > 0 and len(widget_state["selection"]["rows"])):
        return

    num_raw = widget_state["selection"]["rows"][0]
    row = st.session_state.df_eval.iloc[num_raw]
    st.session_state.question = row["question"].squeeze()

    # The per-label sub-series are indexed by label, so entries are read by
    # position with .iloc; an integer passed to [] would rely on the
    # deprecated positional fallback of Series.__getitem__.
    evals = row["eval"]
    answers = row["submitted_answer"]
    histories = row["messages"]

    for i in range(len(st.session_state.list_labels)):
        answer_text = str(answers.iloc[i])
        raw_history = histories.iloc[i]
        with list_tabs[i].chat_message("ai"):
            if evals.iloc[i]:
                st.markdown(answer_text + " " +
                            ":green-badge[:material/check: Correct]")
            else:
                st.markdown(answer_text + " " +
                            ":orange-badge[⚠️ Needs review]")
            # NOTE(review): nesting rebuilt from a flattened diff -- the
            # history container is assumed to live inside the chat message;
            # confirm against the original file.
            messages, status = parse_messages_from_string(raw_history)
            c = st.container(border=True)
            c.markdown("### Message history:")
            c.text("\n".join(m.pretty_repr() for m in messages))
            if not status:
                # Parsing failed somewhere: also show the raw dump.
                c.text(raw_history)
164
+ #
165
+
166
def save_uploaded_file(uploaded_file, folder="data"):
    """Write an uploaded file into ``folder`` and return the saved path.

    Args:
        uploaded_file: object exposing ``name`` (the file name sent by the
            client) and ``getbuffer()`` (its binary content).
        folder: destination directory, created when missing.

    Returns:
        The path of the written file, ``os.path.join(folder, <base name>)``.

    Raises:
        ValueError: if the name has no usable final component.
    """
    # The name is supplied by the client: keep only its last component so
    # that something like "../x" cannot be written outside of `folder`.
    safe_name = os.path.basename(uploaded_file.name)
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid uploaded file name: {uploaded_file.name!r}")

    os.makedirs(folder, exist_ok=True)
    save_path = os.path.join(folder, safe_name)
    with open(save_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return save_path
172
+ #
173
+ ####################################################################
174
+ ### MAIN ###
175
+ ####################################################################
176
+
177
# --- Page setup and session defaults
st.set_page_config(
    page_title='Agents evaluation',
    layout="wide",
    initial_sidebar_state="auto",
)
initializations()

# --- Title area: the GAIA variant adds a thumbnail and an information link
if st.session_state.gaia:
    col1, col2 = st.columns([0.4, 0.6], vertical_alignment="center")
    col1.image("thumbnail.jpg")
    col2.markdown(
        "<h1 style='text-align: center; color: orange;'>GAIA subset evaluation</h1>",
        unsafe_allow_html=True,
    )
    col1.link_button(
        ":blue[More information]",
        "https://huggingface.co/learn/agents-course/unit4/introduction",
    )
    header_area = col2
else:
    st.markdown(
        "<h1 style='text-align: center; color: orange;'>Agents evaluation</h1>",
        unsafe_allow_html=True,
    )
    header_area = st

# Hosts of the help popover and of the upload form, filled in further down
pop = header_area.container()
upd = header_area.expander(":red[**Upload files to update app**]")
197
+
198
+
199
#--- Popover
# Help popover: explains the file formats expected by the upload form
# (test dataset, evaluation dataset, title description, sidebar description)
# and shows one example for each of them.
with pop.popover("### 💡 :red[**How to configure the app to use it with a different evaluation?**]",
                 use_container_width=True):
    # Test dataset: expected fields and example
    st.markdown("""You can modify the data the application is based on by **uploading** your own files, respecting the expected **formats**: \n
The **test dataset** must be a csv file with the **µ** separator character. The header line must contain the expected **fields**: \n
>***task id, question, file name, file url ,answer.*** \n
>*task_id, question, file_name, file_url, answer* \n
*Example of test dataset:*""")
    st.code("""task_idµquestionµfile_nameµfile_urlµanswer \n
2d83110e-a098-4ebb-9987-066c06fa42d0µ".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"µµµright \n
""", language=None)
    st.markdown("___")
    # Evaluation dataset: expected fields and example
    st.markdown("""The **evaluation dataset** must also be a csv file with the **µ** separator character. The header line must contain the expected **fields**: \n
>***label of the agent, task id, agent's response, message history (a string formatted as a list of HumanMessage, AIMessage, ToolMessage from Langchain).*** \n
>*label, task_id, submitted_answer, messages* \n
*Example of evaluation dataset:*""")
    st.code("""labelµtask_idµsubmitted_answerµmessages
Qwen2.5-72B-Instructµ2d83110e-a098-4ebb-9987-066c06fa42d0µrightµ"{'messages': [HumanMessage(content='.rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI', additional_kwargs={}, response_metadata={}, id='98460ac1-f0c0-41dc-8f32-ddf50b123a71'), AIMessage(content='The user wrote a sentence in reverse. ... There\'s no need for any tools here because this is a basic vocabulary question. ... Therefore, the final answer is ""right.""\n</think>\n\nFINAL ANSWER: right', additional_kwargs={}, response_metadata={...}, 'model_name': 'Qwen/Qwen3-235B-A22B', ...}, ..."
""", language=None)
    st.markdown("___")
    # Title description file: example
    st.markdown("""You can also set your **title** and your **sidebar** by **uploading** appropriate files: \n
* a md or txt file for the title. \n
*Example:*""")
    st.code("""*GAIA is a benchmark which aims at ...*
***Data***
*GAIA is made of more than 450 non-trivial question with an unambiguous answer, ...*
""", language=None)
    # Sidebar description file: one "kind;markdown" entry per line
    st.markdown("""* a text file describing, in markdown, the section titles and tool descriptions. \n
*Example:*""")
    st.code("""title;:orange[Langchain tools]
tool;:material/language: TavilySearch
tool;:material/newsstand: WikipediaQueryRun
title;:orange[Custom tools]
tool;:material/slideshow: Ask Youtube video
tool;:material/chess: Chessboard description
tool;:material/speech_to_text: Audio transcription
tool;:material/text_snippet: Get file content
tool;:material/add: Sum numbers
""", language=None)
238
+
239
#--- Update app configuration
# Upload form: every file is optional; on submit, each provided file replaces
# the matching source recorded in st.session_state.
with upd.form(":red[**Update app**]"):
    uploaded_dataset = st.file_uploader("Choose the **dataset** file:", type='csv')
    uploaded_evaluations = st.file_uploader("Choose the **evaluation**s file:", type='csv')
    uploaded_lib = st.file_uploader("Choose the file with the dataset **description**:", type=['md', 'txt'])
    uploaded_sidebar = st.file_uploader("Choose the file with the **sidebar** description:", type=['md', 'txt'])
    valid = st.form_submit_button("🚀 :red[**Update app**]")
    if valid:
        if uploaded_lib is not None:
            # A custom description turns off the GAIA-specific title
            st.session_state.gaia = False
            st.session_state.file_lib = uploaded_lib
        if uploaded_dataset is not None:
            st.session_state.file_dataset = uploaded_dataset
            st.session_state.question = ""
        if uploaded_evaluations is not None:
            # Unlike the other uploads, the evaluations file is written to
            # disk and referenced by its path.
            st.session_state.file_evaluations = save_uploaded_file(uploaded_evaluations)
            print('fichier sauvegardé : ', st.session_state.file_evaluations)
            # New widget key for the selectable dataframe
            st.session_state.dfk = str(uuid.uuid4())
            st.session_state.question = ""
            # NOTE(review): nesting rebuilt from a flattened diff -- this
            # cleanup is assumed to belong to the evaluations branch; confirm.
            if 'list_tabs' in locals():
                del list_tabs
        if uploaded_sidebar is not None:
            st.session_state.file_sidebar = uploaded_sidebar
262
+
263
+
264
# Each step below is isolated in its own try block so that a failure is
# reported on the page without preventing the following steps from running.
# st.exception() expects the exception object itself (it displays its type
# and traceback), so the step name is shown with st.error() and the
# exception is passed unchanged to st.exception().

#--- Get dataset information
try:
    st.session_state.lib = get_lib(st.session_state.file_lib)
except Exception as e:
    st.error(f'Error during get_lib: {e}')
    st.exception(e)

#--- Get sidebar description
try:
    st.session_state.lignes = get_sidebar(st.session_state.file_sidebar)
except Exception as e:
    st.error(f'Error during get_sidebar: {e}')
    st.exception(e)

#--- Set sidebar
try:
    with st.sidebar:
        st.markdown("# :material/construction: Tools used")
        # Each line is "<kind>;<markdown>" where kind is 'title' or 'tool'
        for ligne in st.session_state.lignes:
            lig = ligne.split(";")
            if lig[0] == 'title':
                st.markdown("## "+lig[1])
            if lig[0] == 'tool':
                with st.container(border=True):
                    st.markdown("### "+lig[1])
except Exception as e:
    st.error(f'Error during set sidebar: {e}')
    st.exception(e)

#--- Get dataset
try:
    st.session_state.df_dataset = get_dataset(st.session_state.file_dataset)
except Exception as e:
    st.error(f'Error during get_dataset: {e}')
    st.exception(e)

#--- Get evaluations
try:
    st.session_state.df_eval, st.session_state.df_synth, st.session_state.df_perf, \
        st.session_state.list_labels = get_evaluations(st.session_state.file_evaluations)
except Exception as e:
    st.error(f'Error during get_evaluations: {e}')
    st.exception(e)
302
+
303
+
304
#--- Show dataset expander
# Collapsed by default: dataset description followed by the list of questions,
# each with a download link to its attached file.
with st.expander("## **:orange[Dataset informations]**", expanded=False):
    try:
        st.markdown(">"+st.session_state.lib)
        st.markdown("#### Test dataset:")
        st.dataframe(st.session_state.df_dataset[['question', 'file_url']],
                     column_config={"file_url": st.column_config.LinkColumn("Attached file",
                                                    display_text="Download attached file"),
                                    "question": st.column_config.TextColumn(max_chars=None)})
    except Exception as e:
        # st.exception() expects the exception object, not a formatted string
        st.error(f'Error in dataset informations: {e}')
        st.exception(e)
315
+
316
# --- Performance table
perf_table = st.session_state.df_perf
st.dataframe(perf_table)

# --- Evaluation synthesis: selecting a row triggers get_details
st.markdown("👇 Click to the left of the question to obtain details of the different model evaluations")
st.dataframe(
    st.session_state.df_synth,
    key=st.session_state.dfk,
    selection_mode="single-row",
    on_select=get_details,
)

# --- Details area: the selected question, then one tab per label
cont = st.container()

question_bubble = cont.chat_message('user')
with question_bubble:
    st.markdown(f'###### :blue[{st.session_state.question}]')

cols = []
for label in st.session_state.list_labels:
    cols.append(''.join(label).strip())

# Module-level on purpose: get_details writes into these tabs
list_tabs = cont.tabs(cols)
data/gaia_evals.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/gaia_sidebar.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ title;:orange[Langchain tools]
2
+ tool;:material/language: TavilySearch
3
+ tool;:material/newsstand: WikipediaQueryRun
4
+ title;:orange[Custom tools]
5
+ tool;:material/slideshow: Ask Youtube video
6
+ tool;:material/chess: Chessboard description
7
+ tool;:material/speech_to_text: Audio transcription
8
+ tool;:material/text_snippet: Get file content
9
+ tool;:material/add: Sum numbers
data/gaia_subset.csv ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task_idµquestionµlevelµfile_nameµfile_urlµanswer
2
+ 8e867cd7-cff9-4e6c-867a-ff5ddc2550beµHow many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.µ1µµµ3
3
+ a1e91b78-d3d8-4675-bb8d-62741b4b68a6µIn the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?µ1µµµ3
4
+ 2d83110e-a098-4ebb-9987-066c06fa42d0µ".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"µ1µµµright
5
+ cca530fc-4052-43b2-b130-b30968d8aa44µReview the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.µ1µcca530fc-4052-43b2-b130-b30968d8aa44.pngµhttps://agents-course-unit4-scoring.hf.space/files/cca530fc-4052-43b2-b130-b30968d8aa44µRd5
6
+ 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8µWho nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?µ1µµµFunkMonk
7
+ 6f37996b-2ac7-44b0-8e68-6d28256631b4µ"Given this table defining * on the set S = {a, b, c, d, e}
8
+
9
+ |*|a|b|c|d|e|
10
+ |---|---|---|---|---|---|
11
+ |a|a|b|c|b|d|
12
+ |b|b|c|a|e|c|
13
+ |c|c|a|b|b|a|
14
+ |d|b|e|b|e|d|
15
+ |e|d|b|a|d|c|
16
+
17
+ provide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order."µ1µµµb,e
18
+ 9d191bce-651d-4746-be2d-7ef8ecadb9c2µ"Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.
19
+
20
+ What does Teal'c say in response to the question ""Isn't that hot?"""µ1µµµExtremely
21
+ cabe07ed-9eca-40ea-8ead-410ef5e83f91µWhat is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?µ1µµµLouvrier
22
+ 3cef3a44-215e-4aed-8e3b-b1e3f08063b7µ"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:
23
+
24
+ milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts
25
+
26
+ I need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list."µ1µµµbroccoli, celery, fresh basil, lettuce, sweet potatoes
27
+ 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3µ"Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
28
+
29
+ In your response, please only list the ingredients, not any measurements. So if the recipe calls for ""a pinch of salt"" or ""two cups of ripe strawberries"" the ingredients on the list would be ""salt"" and ""ripe strawberries"".
30
+
31
+ Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients."µ1µ99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3µhttps://agents-course-unit4-scoring.hf.space/files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3µcornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries
32
+ 305ac316-eef6-4446-960a-92d80d542f82µWho did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.µ1µµµWojciech
33
+ f918266a-b3e0-4914-865d-4faa564f1aefµWhat is the final numeric output from the attached Python code?µ1µf918266a-b3e0-4914-865d-4faa564f1aef.pyµhttps://agents-course-unit4-scoring.hf.space/files/f918266a-b3e0-4914-865d-4faa564f1aefµ0
34
+ 3f57289b-8c60-48be-bd80-01f8099ca449µHow many at bats did the Yankee with the most walks in the 1977 regular season have that same season?µ1µµµ519
35
+ 1f975693-876d-457b-a649-393859e79bf3µ"Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
36
+
37
+ Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order."µ1µ1f975693-876d-457b-a649-393859e79bf3.mp3µhttps://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3µ132, 133, 134, 197, 245
38
+ 840bfca7-4f7b-481a-8794-c560c340185dµOn June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?µ1µµµ80GSFC21M0002
39
+ bda648d7-d618-4883-88f4-3466eabd860eµWhere were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.µ1µµµSaint Petersburg
40
+ cf106601-ab4f-4af9-b045-5295fe67b37dµWhat country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.µ1µµµCUB
41
+ a0c07678-e491-4bbc-8f0b-07405144218fµWho are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.µ1µµµYoshida, Uehara
42
+ 7bd855d8-463d-4ed5-93ca-5fe35145f733µThe attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.µ1µ7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsxµhttps://agents-course-unit4-scoring.hf.space/files/7bd855d8-463d-4ed5-93ca-5fe35145f733µ89706.00
43
+ 5a0c1adf-205e-4841-a666-7c3ef95def9dµWhat is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?µ1µµµClaus
data/lib.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *GAIA is a benchmark which aims at evaluating next-generation LLMs (LLMs with augmented capabilities due to added tooling, efficient prompting, access to search, etc).*
2
+ ***Data***
3
+ *GAIA is made of more than 450 non-trivial questions with an unambiguous answer, requiring different levels of tooling and autonomy to solve. It is therefore divided into 3 levels, where level 1 should be breakable by very good LLMs, and level 3 indicates a strong jump in model capabilities. Each level is divided into a fully public dev set for validation, and a test set with private answers and metadata.*
thumbnail.jpg ADDED

Git LFS Details

  • SHA256: c4046208a65e001346da1139b831b35c8488cb765996ff6feaea815e07da9074
  • Pointer size: 132 Bytes
  • Size of remote file: 1.18 MB