import streamlit as st
import pandas as pd
import re
import ast
import io
import os
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from pathlib import Path
import uuid
import warnings
warnings.filterwarnings("ignore")
####################################################################
### FUNCTIONS ###
####################################################################
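# Store the default app configuration in the session state: GAIA subset files,
# dataset description, sidebar layout and the key of the selection dataframe.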
def initializations():
    st.session_state.question = ""
    st.session_state.file_dataset = "./data/gaia_subset.csv"
    st.session_state.file_evaluations = "./data/gaia_evals.csv"
    st.session_state.gaia = True
    st.session_state.file_lib = "./data/lib.md"
    st.session_state.file_sidebar = "./data/gaia_sidebar.txt"
    st.session_state.dfk = str(uuid.uuid4())
#
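# Load the test dataset (µ-separated CSV) into a DataFrame.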
def get_dataset(dataset_file):
    return pd.read_csv(dataset_file, sep='µ', engine='python')
#
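# Load the evaluations (µ-separated CSV), join them with the test dataset on task_id,
# mark each submitted answer as correct or not (case-insensitive, ignoring a trailing
# dot and spaces after commas), and build:
#   - df_eval:  one row per question, one column group per agent label,
#   - df_synth: question x label matrix of correct/incorrect flags,
#   - df_perf:  number and percentage of correct answers per label.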
def get_evaluations(eval_file):
    def set_eval(answer1, answer2):
        answer1 = re.sub(r'\.$', '', answer1.lower()).replace(', ', ',')
        answer2 = re.sub(r'\.$', '', answer2.lower()).replace(', ', ',')
        return answer1 == answer2
    df = pd.read_csv(eval_file, sep='µ', engine='python')
    df = df.merge(st.session_state.df_dataset[['task_id', 'question', 'file_url', 'answer']],
                  on='task_id', how='left')
    list_labels = pd.unique(df['label'])
    list_questions = pd.unique(df['question'])
    df['eval'] = df.apply(lambda r: set_eval(str(r['submitted_answer']),
                                             str(r['answer'])), axis=1)
    df_pivot = df.pivot(index=['task_id', 'question'], columns='label',
                        values=['eval', 'submitted_answer', 'messages'])
    df_reset = df_pivot.reindex(columns=list_labels, level=1).reset_index()
    df_reset['question'] = pd.Categorical(df_reset['question'],
                                          categories=list_questions, ordered=True)
    df_eval = df_reset.sort_values('question')
    df_synth = df.pivot(index='question', columns='label', values='eval') \
                 .reindex(columns=list_labels) \
                 .reindex(pd.unique(df_eval['question']))
    totaux = df_synth.sum(axis=0)
    df_perf = totaux.reset_index().T
    df_perf.columns = df_perf.iloc[0]
    df_perf = df_perf.iloc[1:]
    df_perf.loc["Nb correct"] = totaux
    df_perf.loc["% correct"] = totaux * 100 / len(df_eval)
    df_perf = df_perf.iloc[1:]
    return df_eval, df_synth, df_perf, list_labels
#
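# Read the dataset description (markdown), either from a local path or from an uploaded file.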
def get_lib(lib_file):
    lib = ''
    if isinstance(lib_file, str):
        lib = Path(lib_file).read_text(encoding="utf-8")
    else:
        lib = lib_file.read().decode("utf-8")
    return lib
#
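# Read the sidebar description and return its lines, either from a local path
# or from an uploaded file.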
def get_sidebar(sidebar_file):
    if isinstance(sidebar_file, str):
        with open(sidebar_file, "r", encoding="utf-8") as f:
            lignes = f.readlines()
    else:
        stringio = io.StringIO(sidebar_file.read().decode("utf-8"))
        lignes = stringio.readlines()
    return lignes
#
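# Rebuild LangChain message objects (HumanMessage, AIMessage, ToolMessage) from the
# string stored in the 'messages' column; returns the parsed list and a status flag
# that is False when at least one message could not be parsed.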
def parse_messages_from_string(messages_str):
    messages = []
    status = True
    try:
        messages_match = re.search(r"'messages': \[(.*)\]", messages_str, re.DOTALL)
        messages_content = messages_match.group(1)
        message_splits = re.findall(r'(HumanMessage\(.*?\)|AIMessage\(.*?\)|ToolMessage\(.*?\))(?=, HumanMessage\(|, AIMessage\(|, ToolMessage\(|$)', messages_content, re.DOTALL)
        for msg_str in message_splits:
            # Identify the message type
            if msg_str.startswith('HumanMessage'):
                msg_type = 'HumanMessage'
            elif msg_str.startswith('AIMessage'):
                msg_type = 'AIMessage'
            elif msg_str.startswith('ToolMessage'):
                msg_type = 'ToolMessage'
            else:
                continue  # Unknown type, skip to the next one
            # Extract the constructor arguments
            args_str = msg_str[len(msg_type)+1:-1]  # Strip 'TypeMessage(' and ')'
            # Convert the arguments to a dictionary:
            # replace key=value pairs with "key": value pairs
            args_str = re.sub(r'(\w+)=', r'"\1":', args_str)
            try:
                args = ast.literal_eval('{' + args_str + '}')
                # Build the matching message object
                if msg_type == 'HumanMessage':
                    message = HumanMessage(**args)
                elif msg_type == 'AIMessage':
                    message = AIMessage(**args)
                elif msg_type == 'ToolMessage':
                    message = ToolMessage(**args)
                else:
                    continue
                messages.append(message)
            except Exception as e:
                message = HumanMessage(f"*** Error parsing message: {e}")
                messages.append(message)
                message = HumanMessage("*** See the original list of messages below")
                messages.append(message)
                status = False
                print(f"Error parsing message: {e}")
                continue
    except Exception as e:
        print(f"Error in parse_messages_from_string: {e}")
    finally:
        return messages, status
#
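# Callback of the synthesis dataframe: when a row is selected, show the selected
# question and, in one tab per agent label, its submitted answer, an evaluation
# badge and the parsed message history.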
def get_details():
    dfkey = st.session_state.dfk
    if len(st.session_state[dfkey]) > 0:
        if len(st.session_state[dfkey]["selection"]["rows"]):
            num_raw = st.session_state[dfkey]["selection"]["rows"][0]
            df_eval = st.session_state.df_eval
            st.session_state.question = df_eval.iloc[num_raw].question.squeeze()
            for i in range(0, len(st.session_state.list_labels)):
                with list_tabs[i].chat_message("ai"):
                    if df_eval.iloc[num_raw].eval[i]:
                        st.markdown(str(df_eval.iloc[num_raw].submitted_answer[i]) + " " +
                                    ":green-badge[:material/check: Correct]")
                    else:
                        st.markdown(str(df_eval.iloc[num_raw].submitted_answer[i]) + " " +
                                    ":orange-badge[⚠️ Needs review]")
                    messages, status = parse_messages_from_string(df_eval.iloc[num_raw].messages[i])
                    c = st.container(border=True)
                    c.markdown("### Message history:")
                    c.text("\n".join(m.pretty_repr() for m in messages))
                    if not status:
                        c.text(df_eval.iloc[num_raw].messages[i])
                    # print("\n".join(m.pretty_repr() for m in messages))
#
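# Persist an uploaded file under ./data and return its path.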
def save_uploaded_file(uploaded_file, folder="data"):
    os.makedirs(folder, exist_ok=True)
    save_path = os.path.join(folder, uploaded_file.name)
    with open(save_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return save_path
#
####################################################################
### MAIN ###
####################################################################
#--- Initializations
st.set_page_config(page_title='Agents evaluation', layout="wide",
                   initial_sidebar_state="auto")
if 'dfk' not in st.session_state:
    # First run of the session: store the default configuration
    initializations()
#--- Set title
if st.session_state.gaia:
    col1, col2 = st.columns([0.4, 0.6], vertical_alignment="center")
    col1.image("thumbnail.jpg")
    col2.markdown("<h1 style='text-align: center; color: orange;'>GAIA subset evaluation</h1>",
                  unsafe_allow_html=True)
    col1.link_button(":blue[More information]",
                     "https://huggingface.co/learn/agents-course/unit4/introduction")
    pop = col2.container()
    upd = col2.expander(":red[**Upload files to update app**]")
else:
    st.markdown("<h1 style='text-align: center; color: orange;'>Agents evaluation</h1>",
                unsafe_allow_html=True)
    pop = st.container()
    upd = st.expander(":red[**Upload files to update app**]")
#--- Popover
with pop.popover("### 💡 :red[**How to configure the app to use it with a different evaluation?**]",
                 use_container_width=True):
    st.markdown("""You can modify the data the application is based on by **uploading** your own files, respecting the expected **formats**: \n
The **test dataset** must be a csv file using the **µ** separator character. The header line must contain the expected **fields**: \n
>***task id, question, file name, file url, answer*** \n
>*task_id, question, file_name, file_url, answer* \n
*Example of test dataset:*""")
    st.code("""task_idµquestionµfile_nameµfile_urlµanswer \n
2d83110e-a098-4ebb-9987-066c06fa42d0µ".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"µµµright \n
""", language=None)
    st.markdown("___")
    st.markdown("""The **evaluation dataset** must also be a csv file using the **µ** separator character. The header line must contain the expected **fields**: \n
>***label of the agent, task id, agent's response, message history (a string formatted as a list of HumanMessage, AIMessage, ToolMessage from Langchain).*** \n
>*label, task_id, submitted_answer, messages* \n
*Example of evaluation dataset:*""")
    st.code("""labelµtask_idµsubmitted_answerµmessages
Qwen2.5-72B-Instructµ2d83110e-a098-4ebb-9987-066c06fa42d0µrightµ"{'messages': [HumanMessage(content='.rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI', additional_kwargs={}, response_metadata={}, id='98460ac1-f0c0-41dc-8f32-ddf50b123a71'), AIMessage(content='The user wrote a sentence in reverse. ... There\'s no need for any tools here because this is a basic vocabulary question. ... Therefore, the final answer is ""right.""\n</think>\n\nFINAL ANSWER: right', additional_kwargs={}, response_metadata={...}, 'model_name': 'Qwen/Qwen3-235B-A22B', ...}, ..."
""", language=None)
    st.markdown("___")
    st.markdown("""You can also set your **title** and your **sidebar** by **uploading** appropriate files: \n
* an md or txt file for the title. \n
*Example:*""")
    st.code("""*GAIA is a benchmark which aims at ...*
***Data***
*GAIA is made of more than 450 non-trivial questions with an unambiguous answer, ...*
""", language=None)
    st.markdown("""* a text file describing, in markdown, the section titles and tool descriptions. \n
*Example:*""")
    st.code("""title;:orange[Langchain tools]
tool;:material/language: TavilySearch
tool;:material/newsstand: WikipediaQueryRun
title;:orange[Custom tools]
tool;:material/slideshow: Ask Youtube video
tool;:material/chess: Chessboard description
tool;:material/speech_to_text: Audio transcription
tool;:material/text_snippet: Get file content
tool;:material/add: Sum numbers
""", language=None)
#--- Update app configuration
with upd.form(":red[**Update app**]"):
    uploaded_dataset = st.file_uploader("Choose the **dataset** file:", type='csv')
    uploaded_evaluations = st.file_uploader("Choose the **evaluations** file:", type='csv')
    uploaded_lib = st.file_uploader("Choose the file with the dataset **description**:", type=['md', 'txt'])
    uploaded_sidebar = st.file_uploader("Choose the file with the **sidebar** description:", type=['md', 'txt'])
    valid = st.form_submit_button("🚀 :red[**Update app**]")
    if valid:
        if uploaded_lib is not None:
            st.session_state.gaia = False
            st.session_state.file_lib = uploaded_lib
        if uploaded_dataset is not None:
            st.session_state.file_dataset = uploaded_dataset
            st.session_state.question = ""
        if uploaded_evaluations is not None:
            st.session_state.file_evaluations = save_uploaded_file(uploaded_evaluations)
            print('Saved file: ', st.session_state.file_evaluations)
            st.session_state.dfk = str(uuid.uuid4())
            st.session_state.question = ""
            if 'list_tabs' in locals():
                del list_tabs
        if uploaded_sidebar is not None:
            st.session_state.file_sidebar = uploaded_sidebar
#--- Get dataset information
try:
    st.session_state.lib = get_lib(st.session_state.file_lib)
except Exception as e:
    st.exception(f'Error during get_lib: {e}')
#--- Get sidebar description
try:
    st.session_state.lignes = get_sidebar(st.session_state.file_sidebar)
except Exception as e:
    st.exception(f'Error during get_sidebar: {e}')
#--- Set sidebar
try:
    with st.sidebar:
        st.markdown("# :material/construction: Tools used")
        for ligne in st.session_state.lignes:
            lig = ligne.split(";")
            if lig[0] == 'title':
                st.markdown("## " + lig[1])
            if lig[0] == 'tool':
                with st.container(border=True):
                    st.markdown("### " + lig[1])
except Exception as e:
    st.exception(f'Error during set sidebar: {e}')
#--- Get dataset
try:
    st.session_state.df_dataset = get_dataset(st.session_state.file_dataset)
except Exception as e:
    st.exception(f'Error during get_dataset: {e}')
#--- Get evaluations
try:
    st.session_state.df_eval, st.session_state.df_synth, st.session_state.df_perf, \
        st.session_state.list_labels = get_evaluations(st.session_state.file_evaluations)
except Exception as e:
    st.exception(f'Error during get_evaluations: {e}')
#--- Show dataset expander
with st.expander("## **:orange[Dataset information]**", expanded=False):
    try:
        st.markdown(">" + st.session_state.lib)
        st.markdown("#### Test dataset:")
        st.dataframe(st.session_state.df_dataset[['question', 'file_url']],
                     column_config={"file_url": st.column_config.LinkColumn("Attached file",
                                                                            display_text="Download attached file"),
                                    "question": st.column_config.TextColumn(max_chars=None)})
    except Exception as e:
        st.exception(f'Error in dataset information: {e}')
#--- Show perf dataframe
st.dataframe(st.session_state.df_perf)
#--- Show evaluations synthesis
st.markdown("👇 Click to the left of a question to see the details of the different model evaluations")
st.dataframe(st.session_state.df_synth, on_select=get_details, key=st.session_state.dfk,
             selection_mode="single-row")
#--- Details container
cont = st.container()
with cont.chat_message('user'):
    st.markdown(f'###### :blue[{st.session_state.question}]')
cols = [''.join(col).strip() for col in st.session_state.list_labels]
list_tabs = cont.tabs(cols)