Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files- .gitattributes +1 -0
- app.py +332 -0
- data/gaia_evals.csv +0 -0
- data/gaia_sidebar.txt +9 -0
- data/gaia_subset.csv +43 -0
- data/lib.md +3 -0
- thumbnail.jpg +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
thumbnail.jpg filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import re
|
| 4 |
+
import ast
|
| 5 |
+
import io
|
| 6 |
+
import os
|
| 7 |
+
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import uuid
|
| 10 |
+
import warnings
|
| 11 |
+
warnings.filterwarnings("ignore")
|
| 12 |
+
|
| 13 |
+
####################################################################
|
| 14 |
+
### FUNCTIONS ###
|
| 15 |
+
####################################################################
|
| 16 |
+
|
| 17 |
+
def initializations():
    """Seed ``st.session_state`` with the application defaults.

    Each default is written only when its key is missing, so the function can
    be called at the top of every script run: values changed later (for
    example by the upload form) survive reruns, and every new browser session
    still receives its own defaults.

    The function is deliberately NOT wrapped in ``st.cache_data``: that cache
    is shared by all sessions of the server process, so a cached body would
    run for the first visitor only and later sessions would start with an
    empty session state.
    """
    defaults = {
        "question": "",
        "file_dataset": "./data/gaia_subset.csv",
        "file_evaluations": "./data/gaia_evals.csv",
        "gaia": True,
        "file_lib": "./data/lib.md",
        "file_sidebar": "./data/gaia_sidebar.txt",
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

    # Widget key of the selectable dataframe; generated once per session and
    # only regenerated explicitly when a new evaluation file is uploaded.
    if "dfk" not in st.session_state:
        st.session_state.dfk = str(uuid.uuid4())
|
| 26 |
+
#
|
| 27 |
+
|
| 28 |
+
@st.cache_data(show_spinner=True)
def get_dataset(dataset_file):
    """Read the test dataset from a 'µ'-separated CSV.

    Args:
        dataset_file: Anything ``pandas.read_csv`` accepts (a path or a
            file-like object).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    csv_options = {"sep": 'µ', "engine": 'python'}
    dataset = pd.read_csv(dataset_file, **csv_options)
    return dataset
|
| 31 |
+
#
|
| 32 |
+
|
| 33 |
+
@st.cache_data(show_spinner=True)
def get_evaluations(eval_file):
    """Load the agents' answers and build the tables shown by the app.

    Args:
        eval_file: 'µ'-separated CSV (path or file-like) read with
            ``pandas.read_csv``; the code below uses its ``label``,
            ``task_id``, ``submitted_answer`` and ``messages`` columns.

    Returns:
        A tuple ``(df_eval, df_synth, df_perf, list_labels)``:
        * ``df_eval``: one row per (task_id, question) with, for every label,
          the ``eval`` flag, the submitted answer and the message history.
        * ``df_synth``: question x label table of the ``eval`` flags.
        * ``df_perf``: two rows, "Nb correct" and "% correct", per label.
        * ``list_labels``: labels in order of first appearance in the file.

    NOTE(review): the reference answers come from
    ``st.session_state.df_dataset``, which is not an argument of this cached
    function, so the cache key does not depend on it -- confirm that a
    dataset change cannot be served a stale result.
    """
    def set_eval(answer1, answer2):
        # Compare case-insensitively, ignoring one trailing '.' and the
        # space that may follow a comma.
        answer1 = re.sub(r'\.$', '', answer1.lower()).replace(', ', ',')
        answer2 = re.sub(r'\.$', '', answer2.lower()).replace(', ', ',')
        return answer1 == answer2

    df = pd.read_csv(eval_file, sep='µ', engine='python')
    # Left join: every evaluation row is kept and receives the question,
    # file URL and expected answer of its task.
    df = df.merge(st.session_state.df_dataset[['task_id', 'question', 'file_url', 'answer']],
                  on='task_id', how='left')
    # First-appearance order, reused below to order columns and rows.
    list_labels = pd.unique(df['label'])
    list_questions = pd.unique(df['question'])
    df['eval'] = df.apply(lambda r: set_eval(str(r['submitted_answer']),
                                             str(r['answer'])), axis=1)
    # Wide format: column level 0 is the value name, level 1 is the label.
    df_pivot = df.pivot(index=['task_id','question'], columns='label',
                        values=['eval','submitted_answer','messages'])
    df_reset = df_pivot.reindex(columns=list_labels, level=1).reset_index()
    # An ordered categorical makes sort_values follow the file's question
    # order instead of alphabetical order.
    df_reset['question'] = pd.Categorical(df_reset['question'],
                                          categories=list_questions, ordered=True)
    df_eval = df_reset.sort_values('question')

    # Same row order as df_eval, so a row position selected in the synthesis
    # table can be used to index df_eval.
    df_synth = df.pivot(index='question', columns='label', values='eval') \
                 .reindex(columns=list_labels) \
                 .reindex(pd.unique(df_eval['question']))

    # Per-label sum of the eval flags.
    totaux = df_synth.sum(axis=0)

    # Transposing gives two rows (labels, totals): the labels row becomes the
    # header and is dropped, the two named rows are appended, and the
    # remaining unnamed totals row is dropped last.
    df_perf = totaux.reset_index().T
    df_perf.columns = df_perf.iloc[0]
    df_perf = df_perf.iloc[1:]
    df_perf.loc["Nb correct"] = totaux
    df_perf.loc["% correct"] = totaux *100 / len(df_eval)
    df_perf = df_perf.iloc[1:]

    return df_eval, df_synth, df_perf, list_labels
|
| 68 |
+
#
|
| 69 |
+
|
| 70 |
+
@st.cache_data(show_spinner=True)
def get_lib(lib_file):
    """Return the dataset description as text.

    Args:
        lib_file: Either a path given as ``str``, or an object whose
            ``read()`` returns UTF-8 encoded bytes.

    Returns:
        The decoded file content.
    """
    if not isinstance(lib_file, str):
        raw_bytes = lib_file.read()
        return raw_bytes.decode("utf-8")
    return Path(lib_file).read_text(encoding="utf-8")
|
| 78 |
+
#
|
| 79 |
+
|
| 80 |
+
@st.cache_data(show_spinner=True)
def get_sidebar(sidebar_file):
    """Return the lines of the sidebar description file.

    Args:
        sidebar_file: Either a path given as ``str``, or an object whose
            ``read()`` returns UTF-8 encoded bytes.

    Returns:
        The list of lines, line endings included.
    """
    if not isinstance(sidebar_file, str):
        decoded = sidebar_file.read().decode("utf-8")
        return io.StringIO(decoded).readlines()

    with open(sidebar_file, "r", encoding="utf-8") as handle:
        lines = handle.readlines()
    return lines
|
| 90 |
+
#
|
| 91 |
+
|
| 92 |
+
def parse_messages_from_string(messages_str):
    """Rebuild LangChain messages from the textual dump of a message list.

    The input is expected to contain ``'messages': [...]`` where the list
    holds ``HumanMessage(...)``, ``AIMessage(...)`` and ``ToolMessage(...)``
    constructor calls whose arguments are Python literals.

    Args:
        messages_str: The dumped conversation.

    Returns:
        ``(messages, status)``. ``messages`` is the list of rebuilt message
        objects; for each entry that cannot be rebuilt, two ``HumanMessage``
        placeholders describing the problem are appended instead. ``status``
        is ``False`` as soon as anything could not be parsed (including input
        with no message list, or input that is not a string), which tells the
        caller to display the raw text as well.
    """
    messages = []
    status = True
    try:
        messages_match = re.search(r"'messages': \[(.*)\]", messages_str, re.DOTALL)
        if messages_match is None:
            # No message list at all: report the failure instead of returning
            # an empty history that looks like a successful parse.
            return messages, False
        messages_content = messages_match.group(1)
        message_splits = re.findall(r'(HumanMessage\(.*?\)|AIMessage\(.*?\)|ToolMessage\(.*?\))(?=, HumanMessage\(|, AIMessage\(|, ToolMessage\(|$)', messages_content, re.DOTALL)

        message_types = {
            'HumanMessage': HumanMessage,
            'AIMessage': AIMessage,
            'ToolMessage': ToolMessage,
        }

        for msg_str in message_splits:
            # The constructor name is everything before the first '('.
            message_cls = message_types.get(msg_str.split('(', 1)[0])
            if message_cls is None:
                continue  # Unknown type, skip it

            try:
                # Parse the constructor call with the Python parser and
                # evaluate each argument as a literal. Unlike a textual
                # "key=" -> "key": rewrite, this leaves the *contents* of
                # string arguments (e.g. URLs containing "?v=") untouched.
                call = ast.parse(msg_str, mode='eval').body
                if not isinstance(call, ast.Call):
                    raise ValueError("not a message constructor call")
                args = [ast.literal_eval(arg) for arg in call.args]
                kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
                messages.append(message_cls(*args, **kwargs))
            except Exception as e:
                # Best effort: keep going, but leave a visible trace in the
                # history and flag the result as incomplete.
                messages.append(HumanMessage(f"*** Error parsing message: {e}"))
                messages.append(HumanMessage("*** See the original list of messages below"))
                status = False
                print(f"Error parsing message: {e}")
    except Exception as e:
        # E.g. the input is not a string (missing value in the table).
        status = False
        print(f"Erreur lors de l'analyse du messageparse_message_from_string: {e}")
    # Plain return (not inside ``finally``) so that non-``Exception``
    # interruptions such as KeyboardInterrupt are no longer swallowed.
    return messages, status
|
| 140 |
+
#
|
| 141 |
+
|
| 142 |
+
def get_details():
    """Selection callback: show every agent's answer for the selected question.

    Reads the selection state of the synthesis dataframe (stored in
    ``st.session_state`` under the key held in ``st.session_state.dfk``).
    When a row is selected, stores its question in
    ``st.session_state.question`` and, for each label, writes into the
    matching tab the submitted answer with a badge, followed by the parsed
    message history (plus the raw history when parsing was incomplete).

    Relies on the module-level ``list_tabs`` created by the script's final
    statement and on ``parse_messages_from_string``.
    """
    selection_state = st.session_state[st.session_state.dfk]
    if not len(selection_state) > 0:
        return
    selected_rows = selection_state["selection"]["rows"]
    if not selected_rows:
        return

    # The row is a Series indexed by (value name, label); selecting a value
    # name yields one entry per label, in the same order as list_labels.
    row = st.session_state.df_eval.iloc[selected_rows[0]]
    st.session_state.question = row["question"].squeeze()
    evals = row["eval"]
    answers = row["submitted_answer"]
    histories = row["messages"]

    for i in range(len(st.session_state.list_labels)):
        with list_tabs[i].chat_message("ai"):
            # Positional access must go through .iloc: integer keys on a
            # label-indexed Series are a deprecated fallback.
            verdict = evals.iloc[i]
            # A label with no evaluation for this task holds NaN, which is
            # truthy; it must not be reported as correct.
            if pd.notna(verdict) and bool(verdict):
                badge = ":green-badge[:material/check: Correct]"
            else:
                badge = ":orange-badge[⚠️ Needs review]"
            st.markdown(str(answers.iloc[i]) + " " + badge)

            raw_history = histories.iloc[i]
            messages, status = parse_messages_from_string(raw_history)
            c = st.container(border=True)
            c.markdown("### Message history:")
            c.text("\n".join(m.pretty_repr() for m in messages))
            if not status:
                # Parsing was incomplete: also show the original text.
                c.text(raw_history)
|
| 164 |
+
#
|
| 165 |
+
|
| 166 |
+
def save_uploaded_file(uploaded_file, folder="data"):
    """Write an uploaded file into *folder* and return the path written.

    Args:
        uploaded_file: Object exposing ``name`` (the client-supplied file
            name) and ``getbuffer()`` (the file content as bytes-like data).
        folder: Destination directory, created if it does not exist.

    Returns:
        The path of the saved file, always located directly inside *folder*.

    Raises:
        ValueError: If the supplied name contains no usable file name.
    """
    # The name is chosen by the client: keep only its last component so that
    # directory parts such as "../" or an absolute path cannot make the file
    # land outside *folder*. Backslashes are normalised first so Windows-style
    # separators are stripped on every platform.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid uploaded file name: {uploaded_file.name!r}")

    os.makedirs(folder, exist_ok=True)
    save_path = os.path.join(folder, safe_name)
    with open(save_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return save_path
|
| 172 |
+
#
|
| 173 |
+
####################################################################
|
| 174 |
+
### MAIN ###
|
| 175 |
+
####################################################################
|
| 176 |
+
|
| 177 |
+
# --- Page configuration and session defaults
st.set_page_config(
    page_title='Agents evaluation',
    layout="wide",
    initial_sidebar_state="auto",
)
initializations()

# --- Title area: `pop` hosts the help popover, `upd` the upload form
if not st.session_state.gaia:
    # Custom configuration: plain centred title, full-width containers.
    st.markdown("<h1 style='text-align: center; color: orange;'>Agents evaluation</h1>",
                unsafe_allow_html=True)
    pop = st.container()
    upd = st.expander(":red[**Upload files to update app**]")
else:
    # Default GAIA configuration: thumbnail and link on the left,
    # title and controls on the right.
    col1, col2 = st.columns([0.4, 0.6], vertical_alignment="center")
    col1.image("thumbnail.jpg")
    col2.markdown("<h1 style='text-align: center; color: orange;'>GAIA subset evaluation</h1>",
                  unsafe_allow_html=True)
    col1.link_button(":blue[More information]",
                     "https://huggingface.co/learn/agents-course/unit4/introduction")
    pop = col2.container()
    upd = col2.expander(":red[**Upload files to update app**]")
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
#--- Popover
# In-app help describing the file formats accepted by the upload form.
# The texts are ordinary (non-raw) triple-quoted strings, so each "\n" below
# is an escape sequence that adds a line break to the rendered text.
with pop.popover("### 💡 :red[**How to configure the app to use it with a different evaluation?**]",
                 use_container_width=True):
    # Test dataset: expected columns, then a one-row example.
    st.markdown("""You can modify the data the application is based on by **uploading** your own files, respecting the expected **formats**: \n
The **test dataset** must be a csv file with the **µ** separator character. The header line must contain the expected **fields**: \n
>***task id, question, file name, file url ,answer.*** \n
>*task_id, question, file_name, file_url, answer* \n
*Example of test dataset:*""")
    st.code("""task_idµquestionµfile_nameµfile_urlµanswer \n
2d83110e-a098-4ebb-9987-066c06fa42d0µ".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"µµµright \n
""", language=None)
    st.markdown("___")
    # Evaluation dataset: expected columns, then a one-row example.
    st.markdown("""The **evaluation dataset** must also be a csv file with the **µ** separator character. The header line must contain the expected **fields**: \n
>***label of the agent, task id, agent's response, message history (a string formatted as a list of HumanMessage, AIMessage, ToolMessage from Langchain).*** \n
>*label, task_id, submitted_answer, messages* \n
*Example of evaluation dataset:*""")
    st.code("""labelµtask_idµsubmitted_answerµmessages
Qwen2.5-72B-Instructµ2d83110e-a098-4ebb-9987-066c06fa42d0µrightµ"{'messages': [HumanMessage(content='.rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI', additional_kwargs={}, response_metadata={}, id='98460ac1-f0c0-41dc-8f32-ddf50b123a71'), AIMessage(content='The user wrote a sentence in reverse. ... There\'s no need for any tools here because this is a basic vocabulary question. ... Therefore, the final answer is ""right.""\n</think>\n\nFINAL ANSWER: right', additional_kwargs={}, response_metadata={...}, 'model_name': 'Qwen/Qwen3-235B-A22B', ...}, ..."
""", language=None)
    st.markdown("___")
    # Optional files: the description text and the sidebar definition.
    st.markdown("""You can also set your **title** and your **sidebar** by **uploading** appropriate files: \n
* a md or txt file for the title. \n
*Example:*""")
    st.code("""*GAIA is a benchmark which aims at ...*
***Data***
*GAIA is made of more than 450 non-trivial question with an unambiguous answer, ...*
""", language=None)
    st.markdown("""* a text file describing, in markdown, the section titles and tool descriptions. \n
*Example:*""")
    # Each sidebar line is "<kind>;<markdown text>" with kind "title" or "tool".
    st.code("""title;:orange[Langchain tools]
tool;:material/language: TavilySearch
tool;:material/newsstand: WikipediaQueryRun
title;:orange[Custom tools]
tool;:material/slideshow: Ask Youtube video
tool;:material/chess: Chessboard description
tool;:material/speech_to_text: Audio transcription
tool;:material/text_snippet: Get file content
tool;:material/add: Sum numbers
""", language=None)
|
| 238 |
+
|
| 239 |
+
#--- Update app configuration
# Upload form: each provided file replaces the corresponding data source kept
# in st.session_state; the loading steps further down then read from there.
with upd.form(":red[**Update app**]"):
    uploaded_dataset = st.file_uploader("Choose the **dataset** file:", type='csv')
    uploaded_evaluations = st.file_uploader("Choose the **evaluation**s file:", type='csv')
    uploaded_lib = st.file_uploader("Choose the file with the dataset **description**:", type=['md', 'txt'])
    uploaded_sidebar = st.file_uploader("Choose the file with the **sidebar** description:", type=['md', 'txt'])
    valid = st.form_submit_button("🚀 :red[**Update app**]")
    if valid:
        if uploaded_lib is not None:
            # A custom description switches the page to the generic title.
            st.session_state.gaia = False
            st.session_state.file_lib = uploaded_lib
        if uploaded_dataset is not None:
            st.session_state.file_dataset = uploaded_dataset
            # Clear the question displayed in the details area.
            st.session_state.question = ""
        if uploaded_evaluations is not None:
            # Unlike the other uploads, the evaluations file is written to
            # disk and referenced by its path.
            st.session_state.file_evaluations = save_uploaded_file(uploaded_evaluations)
            print('fichier sauvegardé : ', st.session_state.file_evaluations)
            # A new widget key gives the results table a fresh selection state.
            st.session_state.dfk = str(uuid.uuid4())
            st.session_state.question = ""
            # NOTE(review): at module level locals() is the module namespace,
            # and list_tabs is only assigned by the script's last statement --
            # confirm whether this branch can ever run.
            if 'list_tabs' in locals():
                del list_tabs
        if uploaded_sidebar is not None:
            st.session_state.file_sidebar = uploaded_sidebar
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def _report_failure(step, exc):
    """Show which step failed, then the exception itself with its traceback.

    ``st.exception`` expects the exception object; giving it a formatted
    string loses the exception type and the stack trace.
    """
    st.error(f'Error {step}: {exc}')
    st.exception(exc)


# Each step below is best-effort: a failure is reported on the page and the
# script carries on with the remaining steps.

# --- Get dataset information
try:
    st.session_state.lib = get_lib(st.session_state.file_lib)
except Exception as e:
    _report_failure('during get_lib', e)

# --- Get sidebar description
try:
    st.session_state.lignes = get_sidebar(st.session_state.file_sidebar)
except Exception as e:
    _report_failure('during get_sidebar', e)

# --- Set sidebar
try:
    with st.sidebar:
        st.markdown("# :material/construction: Tools used")
        for ligne in st.session_state.lignes:
            # Line format: "<kind>;<markdown text>"
            lig = ligne.split(";")
            if lig[0] == 'title':
                st.markdown("## " + lig[1])
            elif lig[0] == 'tool':
                with st.container(border=True):
                    st.markdown("### " + lig[1])
except Exception as e:
    _report_failure('during set sidebar', e)

# --- Get dataset
try:
    st.session_state.df_dataset = get_dataset(st.session_state.file_dataset)
except Exception as e:
    _report_failure('during get_dataset', e)

# --- Get evaluations
try:
    (st.session_state.df_eval,
     st.session_state.df_synth,
     st.session_state.df_perf,
     st.session_state.list_labels) = get_evaluations(st.session_state.file_evaluations)
except Exception as e:
    _report_failure('during get_evaluations', e)


# --- Show dataset expander
with st.expander("## **:orange[Dataset informations]**", expanded=False):
    try:
        st.markdown(">" + st.session_state.lib)
        st.markdown("#### Test dataset:")
        st.dataframe(
            st.session_state.df_dataset[['question', 'file_url']],
            column_config={
                "file_url": st.column_config.LinkColumn("Attached file",
                                                        display_text="Download attached file"),
                "question": st.column_config.TextColumn(max_chars=None),
            },
        )
    except Exception as e:
        _report_failure('in dataset informations', e)
|
| 315 |
+
|
| 316 |
+
#--- Show perf dataframe
# Per-label performance rows produced by get_evaluations.
st.dataframe(st.session_state.df_perf)

#--- Show evaluations synthesis
st.markdown("👇 Click to the left of the question to obtain details of the different model evaluations")
# Single-row selection triggers get_details; the widget key comes from the
# session state (st.session_state.dfk).
st.dataframe(st.session_state.df_synth, on_select=get_details, key=st.session_state.dfk,
             selection_mode="single-row")

#--- Details container
cont = st.container()

# Question stored in the session state (set by get_details).
with cont.chat_message('user'):
    st.markdown(f'###### :blue[{st.session_state.question}]')

# One tab title per agent label.
cols = [''.join(col).strip() for col in st.session_state.list_labels]

# Module-level name read by get_details, which indexes it by label position.
list_tabs = cont.tabs(cols)
|
data/gaia_evals.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/gaia_sidebar.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
title;:orange[Langchain tools]
|
| 2 |
+
tool;:material/language: TavilySearch
|
| 3 |
+
tool;:material/newsstand: WikipediaQueryRun
|
| 4 |
+
title;:orange[Custom tools]
|
| 5 |
+
tool;:material/slideshow: Ask Youtube video
|
| 6 |
+
tool;:material/chess: Chessboard description
|
| 7 |
+
tool;:material/speech_to_text: Audio transcription
|
| 8 |
+
tool;:material/text_snippet: Get file content
|
| 9 |
+
tool;:material/add: Sum numbers
|
data/gaia_subset.csv
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task_idµquestionµlevelµfile_nameµfile_urlµanswer
|
| 2 |
+
8e867cd7-cff9-4e6c-867a-ff5ddc2550beµHow many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.µ1µµµ3
|
| 3 |
+
a1e91b78-d3d8-4675-bb8d-62741b4b68a6µIn the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?µ1µµµ3
|
| 4 |
+
2d83110e-a098-4ebb-9987-066c06fa42d0µ".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"µ1µµµright
|
| 5 |
+
cca530fc-4052-43b2-b130-b30968d8aa44µReview the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.µ1µcca530fc-4052-43b2-b130-b30968d8aa44.pngµhttps://agents-course-unit4-scoring.hf.space/files/cca530fc-4052-43b2-b130-b30968d8aa44µRd5
|
| 6 |
+
4fc2f1ae-8625-45b5-ab34-ad4433bc21f8µWho nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?µ1µµµFunkMonk
|
| 7 |
+
6f37996b-2ac7-44b0-8e68-6d28256631b4µ"Given this table defining * on the set S = {a, b, c, d, e}
|
| 8 |
+
|
| 9 |
+
|*|a|b|c|d|e|
|
| 10 |
+
|---|---|---|---|---|---|
|
| 11 |
+
|a|a|b|c|b|d|
|
| 12 |
+
|b|b|c|a|e|c|
|
| 13 |
+
|c|c|a|b|b|a|
|
| 14 |
+
|d|b|e|b|e|d|
|
| 15 |
+
|e|d|b|a|d|c|
|
| 16 |
+
|
| 17 |
+
provide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order."µ1µµµb,e
|
| 18 |
+
9d191bce-651d-4746-be2d-7ef8ecadb9c2µ"Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.
|
| 19 |
+
|
| 20 |
+
What does Teal'c say in response to the question ""Isn't that hot?"""µ1µµµExtremely
|
| 21 |
+
cabe07ed-9eca-40ea-8ead-410ef5e83f91µWhat is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?µ1µµµLouvrier
|
| 22 |
+
3cef3a44-215e-4aed-8e3b-b1e3f08063b7µ"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:
|
| 23 |
+
|
| 24 |
+
milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts
|
| 25 |
+
|
| 26 |
+
I need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list."µ1µµµbroccoli, celery, fresh basil, lettuce, sweet potatoes
|
| 27 |
+
99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3µ"Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
|
| 28 |
+
|
| 29 |
+
In your response, please only list the ingredients, not any measurements. So if the recipe calls for ""a pinch of salt"" or ""two cups of ripe strawberries"" the ingredients on the list would be ""salt"" and ""ripe strawberries"".
|
| 30 |
+
|
| 31 |
+
Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients."µ1µ99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3µhttps://agents-course-unit4-scoring.hf.space/files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3µcornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries
|
| 32 |
+
305ac316-eef6-4446-960a-92d80d542f82µWho did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.µ1µµµWojciech
|
| 33 |
+
f918266a-b3e0-4914-865d-4faa564f1aefµWhat is the final numeric output from the attached Python code?µ1µf918266a-b3e0-4914-865d-4faa564f1aef.pyµhttps://agents-course-unit4-scoring.hf.space/files/f918266a-b3e0-4914-865d-4faa564f1aefµ0
|
| 34 |
+
3f57289b-8c60-48be-bd80-01f8099ca449µHow many at bats did the Yankee with the most walks in the 1977 regular season have that same season?µ1µµµ519
|
| 35 |
+
1f975693-876d-457b-a649-393859e79bf3µ"Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
|
| 36 |
+
|
| 37 |
+
Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order."µ1µ1f975693-876d-457b-a649-393859e79bf3.mp3µhttps://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3µ132, 133, 134, 197, 245
|
| 38 |
+
840bfca7-4f7b-481a-8794-c560c340185dµOn June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?µ1µµµ80GSFC21M0002
|
| 39 |
+
bda648d7-d618-4883-88f4-3466eabd860eµWhere were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.µ1µµµSaint Petersburg
|
| 40 |
+
cf106601-ab4f-4af9-b045-5295fe67b37dµWhat country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.µ1µµµCUB
|
| 41 |
+
a0c07678-e491-4bbc-8f0b-07405144218fµWho are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.µ1µµµYoshida, Uehara
|
| 42 |
+
7bd855d8-463d-4ed5-93ca-5fe35145f733µThe attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.µ1µ7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsxµhttps://agents-course-unit4-scoring.hf.space/files/7bd855d8-463d-4ed5-93ca-5fe35145f733µ89706.00
|
| 43 |
+
5a0c1adf-205e-4841-a666-7c3ef95def9dµWhat is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?µ1µµµClaus
|
data/lib.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*GAIA is a benchmark which aims at evaluating next-generation LLMs (LLMs with augmented capabilities due to added tooling, efficient prompting, access to search, etc).*
|
| 2 |
+
***Data***
|
| 3 |
+
*GAIA is made of more than 450 non-trivial questions with an unambiguous answer, requiring different levels of tooling and autonomy to solve. It is therefore divided into 3 levels, where level 1 should be breakable by very good LLMs, and level 3 indicates a strong jump in model capabilities. Each level is divided into a fully public dev set for validation, and a test set with private answers and metadata.*
|
thumbnail.jpg
ADDED
|
|
Git LFS Details
|