import streamlit as st
import pandas as pd
import re
import ast
import io
import os
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from pathlib import Path
import uuid
import warnings
warnings.filterwarnings("ignore")
####################################################################
### FUNCTIONS ###
####################################################################
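# Store the default app configuration in the session state: GAIA subset files,
# dataset description, sidebar layout and the key of the selection dataframe.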
def initializations():
    st.session_state.question = ""
    st.session_state.file_dataset = "./data/gaia_subset.csv"
    st.session_state.file_evaluations = "./data/gaia_evals.csv"
    st.session_state.gaia = True
    st.session_state.file_lib = "./data/lib.md"
    st.session_state.file_sidebar = "./data/gaia_sidebar.txt"
    st.session_state.dfk = str(uuid.uuid4())
#
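# Load the test dataset (µ-separated CSV) into a DataFrame.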
def get_dataset(dataset_file):
    return pd.read_csv(dataset_file, sep='µ', engine='python')
#
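# Load the evaluations (µ-separated CSV), join them with the test dataset on task_id,
# mark each submitted answer as correct or not (case-insensitive, ignoring a trailing
# dot and spaces after commas), and build:
#   - df_eval:  one row per question, one column group per agent label,
#   - df_synth: question x label matrix of correct/incorrect flags,
#   - df_perf:  number and percentage of correct answers per label.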
def get_evaluations(eval_file):
    def set_eval(answer1, answer2):
        answer1 = re.sub(r'\.$', '', answer1.lower()).replace(', ', ',')
        answer2 = re.sub(r'\.$', '', answer2.lower()).replace(', ', ',')
        return answer1 == answer2
    df = pd.read_csv(eval_file, sep='µ', engine='python')
    df = df.merge(st.session_state.df_dataset[['task_id', 'question', 'file_url', 'answer']],
                  on='task_id', how='left')
    list_labels = pd.unique(df['label'])
    list_questions = pd.unique(df['question'])
    df['eval'] = df.apply(lambda r: set_eval(str(r['submitted_answer']),
                                             str(r['answer'])), axis=1)
    df_pivot = df.pivot(index=['task_id', 'question'], columns='label',
                        values=['eval', 'submitted_answer', 'messages'])
    df_reset = df_pivot.reindex(columns=list_labels, level=1).reset_index()
    df_reset['question'] = pd.Categorical(df_reset['question'],
                                          categories=list_questions, ordered=True)
    df_eval = df_reset.sort_values('question')
    df_synth = df.pivot(index='question', columns='label', values='eval') \
                 .reindex(columns=list_labels) \
                 .reindex(pd.unique(df_eval['question']))
    totaux = df_synth.sum(axis=0)
    df_perf = totaux.reset_index().T
    df_perf.columns = df_perf.iloc[0]
    df_perf = df_perf.iloc[1:]
    df_perf.loc["Nb correct"] = totaux
    df_perf.loc["% correct"] = totaux * 100 / len(df_eval)
    df_perf = df_perf.iloc[1:]
    return df_eval, df_synth, df_perf, list_labels
#
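# Read the dataset description (markdown), either from a local path or from an uploaded file.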
def get_lib(lib_file):
    lib = ''
    if isinstance(lib_file, str):
        lib = Path(lib_file).read_text(encoding="utf-8")
    else:
        lib = lib_file.read().decode("utf-8")
    return lib
#
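# Read the sidebar description and return its lines, either from a local path
# or from an uploaded file.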
def get_sidebar(sidebar_file):
    if isinstance(sidebar_file, str):
        with open(sidebar_file, "r", encoding="utf-8") as f:
            lignes = f.readlines()
    else:
        stringio = io.StringIO(sidebar_file.read().decode("utf-8"))
        lignes = stringio.readlines()
    return lignes
#
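# Rebuild LangChain message objects (HumanMessage, AIMessage, ToolMessage) from the
# string stored in the 'messages' column; returns the parsed list and a status flag
# that is False when at least one message could not be parsed.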
def parse_messages_from_string(messages_str):
    messages = []
    status = True
    try:
        messages_match = re.search(r"'messages': \[(.*)\]", messages_str, re.DOTALL)
        messages_content = messages_match.group(1)
        message_splits = re.findall(r'(HumanMessage\(.*?\)|AIMessage\(.*?\)|ToolMessage\(.*?\))(?=, HumanMessage\(|, AIMessage\(|, ToolMessage\(|$)', messages_content, re.DOTALL)
        for msg_str in message_splits:
            # Identify the message type
            if msg_str.startswith('HumanMessage'):
                msg_type = 'HumanMessage'
            elif msg_str.startswith('AIMessage'):
                msg_type = 'AIMessage'
            elif msg_str.startswith('ToolMessage'):
                msg_type = 'ToolMessage'
            else:
                continue  # Unknown type, skip to the next one
            # Extract the constructor arguments
            args_str = msg_str[len(msg_type)+1:-1]  # Strip 'TypeMessage(' and ')'
            # Convert the arguments to a dictionary:
            # replace key=value pairs with "key": value pairs
            args_str = re.sub(r'(\w+)=', r'"\1":', args_str)
            try:
                args = ast.literal_eval('{' + args_str + '}')
                # Build the matching message object
                if msg_type == 'HumanMessage':
                    message = HumanMessage(**args)
                elif msg_type == 'AIMessage':
                    message = AIMessage(**args)
                elif msg_type == 'ToolMessage':
                    message = ToolMessage(**args)
                else:
                    continue
                messages.append(message)
            except Exception as e:
                message = HumanMessage(f"*** Error parsing message: {e}")
                messages.append(message)
                message = HumanMessage("*** See the original list of messages below")
                messages.append(message)
                status = False
                print(f"Error parsing message: {e}")
                continue
    except Exception as e:
        print(f"Error in parse_messages_from_string: {e}")
    finally:
        return messages, status
#
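# Callback of the synthesis dataframe: when a row is selected, show the selected
# question and, in one tab per agent label, its submitted answer, an evaluation
# badge and the parsed message history.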
def get_details():
    dfkey = st.session_state.dfk
    if len(st.session_state[dfkey]) > 0:
        if len(st.session_state[dfkey]["selection"]["rows"]):
            num_raw = st.session_state[dfkey]["selection"]["rows"][0]
            df_eval = st.session_state.df_eval
            st.session_state.question = df_eval.iloc[num_raw].question.squeeze()
            for i in range(0, len(st.session_state.list_labels)):
                with list_tabs[i].chat_message("ai"):
                    if df_eval.iloc[num_raw].eval[i]:
                        st.markdown(str(df_eval.iloc[num_raw].submitted_answer[i]) + " " +
                                    ":green-badge[:material/check: Correct]")
                    else:
                        st.markdown(str(df_eval.iloc[num_raw].submitted_answer[i]) + " " +
                                    ":orange-badge[⚠️ Needs review]")
                    messages, status = parse_messages_from_string(df_eval.iloc[num_raw].messages[i])
                    c = st.container(border=True)
                    c.markdown("### Message history:")
                    c.text("\n".join(m.pretty_repr() for m in messages))
                    if not status:
                        c.text(df_eval.iloc[num_raw].messages[i])
                    # print("\n".join(m.pretty_repr() for m in messages))
#
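# Persist an uploaded file under ./data and return its path.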
def save_uploaded_file(uploaded_file, folder="data"):
    os.makedirs(folder, exist_ok=True)
    save_path = os.path.join(folder, uploaded_file.name)
    with open(save_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return save_path
#
####################################################################
### MAIN ###
####################################################################
#--- Initializations
st.set_page_config(page_title='Agents evaluation', layout="wide",
                   initial_sidebar_state="auto")
if 'dfk' not in st.session_state:
    # First run of the session: store the default configuration
    initializations()
#--- Set title
if st.session_state.gaia:
    col1, col2 = st.columns([0.4, 0.6], vertical_alignment="center")
    col1.image("thumbnail.jpg")
    col2.markdown("<h1 style='text-align: center; color: orange;'>GAIA subset evaluation</h1>",
                  unsafe_allow_html=True)
    col1.link_button(":blue[More information]",
                     "https://huggingface.co/learn/agents-course/unit4/introduction")
    pop = col2.container()
    upd = col2.expander(":red[**Upload files to update app**]")
else:
    st.markdown("<h1 style='text-align: center; color: orange;'>Agents evaluation</h1>",
                unsafe_allow_html=True)
    pop = st.container()
    upd = st.expander(":red[**Upload files to update app**]")
#--- Popover
with pop.popover("### 💡 :red[**How to configure the app to use it with a different evaluation?**]",
                 use_container_width=True):
    st.markdown("""You can modify the data the application is based on by **uploading** your own files, respecting the expected **formats**: \n
The **test dataset** must be a csv file using the **µ** separator character. The header line must contain the expected **fields**: \n
>***task id, question, file name, file url, answer*** \n
>*task_id, question, file_name, file_url, answer* \n
*Example of test dataset:*""")
    st.code("""task_idµquestionµfile_nameµfile_urlµanswer \n
2d83110e-a098-4ebb-9987-066c06fa42d0µ".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"µµµright \n
""", language=None)
    st.markdown("___")
    st.markdown("""The **evaluation dataset** must also be a csv file using the **µ** separator character. The header line must contain the expected **fields**: \n
>***label of the agent, task id, agent's response, message history (a string formatted as a list of HumanMessage, AIMessage, ToolMessage from Langchain).*** \n
>*label, task_id, submitted_answer, messages* \n
*Example of evaluation dataset:*""")
    st.code("""labelµtask_idµsubmitted_answerµmessages
Qwen2.5-72B-Instructµ2d83110e-a098-4ebb-9987-066c06fa42d0µrightµ"{'messages': [HumanMessage(content='.rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI', additional_kwargs={}, response_metadata={}, id='98460ac1-f0c0-41dc-8f32-ddf50b123a71'), AIMessage(content='The user wrote a sentence in reverse. ... There\'s no need for any tools here because this is a basic vocabulary question. ... Therefore, the final answer is ""right.""\n</think>\n\nFINAL ANSWER: right', additional_kwargs={}, response_metadata={...}, 'model_name': 'Qwen/Qwen3-235B-A22B', ...}, ..."
""", language=None)
    st.markdown("___")
    st.markdown("""You can also set your **title** and your **sidebar** by **uploading** appropriate files: \n
* an md or txt file for the title. \n
*Example:*""")
    st.code("""*GAIA is a benchmark which aims at ...*
***Data***
*GAIA is made of more than 450 non-trivial questions with an unambiguous answer, ...*
""", language=None)
    st.markdown("""* a text file describing, in markdown, the section titles and tool descriptions. \n
*Example:*""")
    st.code("""title;:orange[Langchain tools]
tool;:material/language: TavilySearch
tool;:material/newsstand: WikipediaQueryRun
title;:orange[Custom tools]
tool;:material/slideshow: Ask Youtube video
tool;:material/chess: Chessboard description
tool;:material/speech_to_text: Audio transcription
tool;:material/text_snippet: Get file content
tool;:material/add: Sum numbers
""", language=None)
#--- Update app configuration
with upd.form(":red[**Update app**]"):
    uploaded_dataset = st.file_uploader("Choose the **dataset** file:", type='csv')
    uploaded_evaluations = st.file_uploader("Choose the **evaluations** file:", type='csv')
    uploaded_lib = st.file_uploader("Choose the file with the dataset **description**:", type=['md', 'txt'])
    uploaded_sidebar = st.file_uploader("Choose the file with the **sidebar** description:", type=['md', 'txt'])
    valid = st.form_submit_button("🚀 :red[**Update app**]")
    if valid:
        if uploaded_lib is not None:
            st.session_state.gaia = False
            st.session_state.file_lib = uploaded_lib
        if uploaded_dataset is not None:
            st.session_state.file_dataset = uploaded_dataset
            st.session_state.question = ""
        if uploaded_evaluations is not None:
            st.session_state.file_evaluations = save_uploaded_file(uploaded_evaluations)
            print('Saved file: ', st.session_state.file_evaluations)
            st.session_state.dfk = str(uuid.uuid4())
            st.session_state.question = ""
            if 'list_tabs' in locals():
                del list_tabs
        if uploaded_sidebar is not None:
            st.session_state.file_sidebar = uploaded_sidebar
#--- Get dataset information
try:
    st.session_state.lib = get_lib(st.session_state.file_lib)
except Exception as e:
    st.exception(f'Error during get_lib: {e}')
#--- Get sidebar description
try:
    st.session_state.lignes = get_sidebar(st.session_state.file_sidebar)
except Exception as e:
    st.exception(f'Error during get_sidebar: {e}')
#--- Set sidebar
try:
    with st.sidebar:
        st.markdown("# :material/construction: Tools used")
        for ligne in st.session_state.lignes:
            lig = ligne.split(";")
            if lig[0] == 'title':
                st.markdown("## " + lig[1])
            if lig[0] == 'tool':
                with st.container(border=True):
                    st.markdown("### " + lig[1])
except Exception as e:
    st.exception(f'Error during set sidebar: {e}')
#--- Get dataset
try:
    st.session_state.df_dataset = get_dataset(st.session_state.file_dataset)
except Exception as e:
    st.exception(f'Error during get_dataset: {e}')
#--- Get evaluations
try:
    st.session_state.df_eval, st.session_state.df_synth, st.session_state.df_perf, \
        st.session_state.list_labels = get_evaluations(st.session_state.file_evaluations)
except Exception as e:
    st.exception(f'Error during get_evaluations: {e}')
#--- Show dataset expander
with st.expander("## **:orange[Dataset information]**", expanded=False):
    try:
        st.markdown(">" + st.session_state.lib)
        st.markdown("#### Test dataset:")
        st.dataframe(st.session_state.df_dataset[['question', 'file_url']],
                     column_config={"file_url": st.column_config.LinkColumn("Attached file",
                                                                            display_text="Download attached file"),
                                    "question": st.column_config.TextColumn(max_chars=None)})
    except Exception as e:
        st.exception(f'Error in dataset information: {e}')
#--- Show perf dataframe
st.dataframe(st.session_state.df_perf)
#--- Show evaluations synthesis
st.markdown("👇 Click to the left of a question to see the details of the different model evaluations")
st.dataframe(st.session_state.df_synth, on_select=get_details, key=st.session_state.dfk,
             selection_mode="single-row")
#--- Details container
cont = st.container()
with cont.chat_message('user'):
    st.markdown(f'###### :blue[{st.session_state.question}]')
cols = [''.join(col).strip() for col in st.session_state.list_labels]
list_tabs = cont.tabs(cols)