Spaces:

MBZUAI-LLM
/

Mobile-MMLU-Challenge

Sleeping

App Files Files Community

Mobile-MMLU-Challenge / app.py

SondosMB

Update app.py

d6777ef verified 11 months ago

raw

history blame

17.6 kB

	import gradio as gr
	import pandas as pd
	import os
	import re
	from datetime import datetime
	from huggingface_hub import hf_hub_download
	from huggingface_hub import HfApi, HfFolder

	# File names for the local leaderboard and the ground-truth answers.
	LEADERBOARD_FILE = "leaderboard.csv"
	GROUND_TRUTH_FILE = "ground_truth.csv"
	# Human-readable date, captured once when the module is imported.
	LAST_UPDATED = f"{datetime.now():%B %d, %Y}"

	# Silence the hub's symlink warning, then insist on a token before anything else runs.
	os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
	HF_TOKEN = os.environ.get("HF_TOKEN")
	if HF_TOKEN is None or HF_TOKEN == "":
	    raise ValueError("HF_TOKEN environment variable is not set or invalid.")

	def initialize_leaderboard_file():
	    """
	    Write a header-only leaderboard CSV when the file is missing or zero bytes long.

	    An existing, non-empty file is left untouched.
	    """
	    header_columns = [
	        "Model Name", "Overall Accuracy", "Valid Accuracy",
	        "Correct Predictions", "Total Questions", "Timestamp"
	    ]
	    file_missing = not os.path.exists(LEADERBOARD_FILE)
	    # Short-circuit keeps os.stat() from running on a path that does not exist.
	    if file_missing or os.stat(LEADERBOARD_FILE).st_size == 0:
	        empty_board = pd.DataFrame(columns=header_columns)
	        empty_board.to_csv(LEADERBOARD_FILE, index=False)

	def clean_answer(answer):
	    """
	    Reduce a raw prediction to a single upper-case option letter (A-D).

	    Every character other than A-D / a-d is discarded and the first survivor
	    is returned in upper case. Missing values, and values containing none of
	    those letters, give ``None``. Letters inside longer words count too, so
	    ``"Answer: C"`` yields ``"A"``.
	    """
	    if pd.isna(answer):
	        return None
	    option_letters = re.sub(r'[^A-Da-d]', '', str(answer))
	    if not option_letters:
	        return None
	    return option_letters[0].upper()


	def update_leaderboard(results):
	    """
	    Append one submission to the local leaderboard CSV, then push the file to the Space repo.

	    Args:
	        results: mapping with the keys 'model_name', 'overall_accuracy',
	            'valid_accuracy', 'correct_predictions' and 'total_questions'.
	            The two accuracies are multiplied by 100 and rounded to two
	            decimals before being stored.

	    Returns:
	        None. This is best-effort: failures are printed rather than raised,
	        so a failed push never discards a submission already written locally.
	    """
	    new_entry = {
	        "Model Name": results['model_name'],
	        "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
	        "Valid Accuracy": round(results['valid_accuracy'] * 100, 2),
	        "Correct Predictions": results['correct_predictions'],
	        "Total Questions": results['total_questions'],
	        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	    }

	    # Step 1: update the local leaderboard file.
	    try:
	        # A file that exists but is empty still needs its header row,
	        # otherwise the first data row would later be read as the header.
	        needs_header = (
	            not os.path.exists(LEADERBOARD_FILE)
	            or os.stat(LEADERBOARD_FILE).st_size == 0
	        )
	        pd.DataFrame([new_entry]).to_csv(
	            LEADERBOARD_FILE,
	            mode='a',  # Append mode
	            index=False,
	            header=needs_header,
	        )
	        print(f"Leaderboard updated successfully at {LEADERBOARD_FILE}")
	    except Exception as e:
	        print(f"Error updating leaderboard file: {e}")
	        return  # nothing new to push

	    # Step 2: push the updated file to the Hugging Face repository.
	    # Reported separately so a network failure is not mistaken for a failed write.
	    try:
	        api = HfApi()
	        # Prefer the token validated at import time; fall back to the stored login.
	        token = HF_TOKEN or HfFolder.get_token()

	        api.upload_file(
	            path_or_fileobj=LEADERBOARD_FILE,
	            path_in_repo="leaderboard.csv",
	            # NOTE(review): confirm this is the Space that actually serves the app.
	            repo_id="SondosMB/ss",
	            repo_type="space",
	            token=token,
	        )
	        print("Leaderboard changes pushed to Hugging Face repository.")
	    except Exception as e:
	        print(f"Error pushing leaderboard to Hugging Face repository: {e}")



	def load_leaderboard():
	    """
	    Read the leaderboard CSV into a DataFrame.

	    When the file is missing or zero bytes long, an empty DataFrame carrying
	    the six leaderboard columns is returned instead.
	    """
	    has_content = (
	        os.path.exists(LEADERBOARD_FILE)
	        and os.stat(LEADERBOARD_FILE).st_size > 0
	    )
	    if has_content:
	        return pd.read_csv(LEADERBOARD_FILE)

	    column_names = (
	        "Model Name",
	        "Overall Accuracy",
	        "Valid Accuracy",
	        "Correct Predictions",
	        "Total Questions",
	        "Timestamp",
	    )
	    return pd.DataFrame({column: [] for column in column_names})

	def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
	    """
	    Score an uploaded prediction CSV against the ground-truth answers.

	    Args:
	        prediction_file: uploaded file object; its ``.name`` is read as a CSV
	            path. The CSV must have 'question_id' and 'predicted_answer' columns.
	        model_name: name recorded with the result; blank values become
	            "Unknown Model".
	        add_to_leaderboard: when truthy, the result is handed to
	            ``update_leaderboard``.

	    Returns:
	        A ``(status_message, leaderboard_dataframe)`` tuple. Problems are
	        reported through the status message rather than raised.

	    Predictions are matched to the ground truth with an inner merge on
	    'question_id', so 'Total Questions' counts matched rows only.
	    NOTE(review): ground-truth questions missing from the submission are
	    therefore not penalised - confirm this is intended.
	    """
	    # Cheap check first: do not hit the hub when nothing was uploaded.
	    if not prediction_file:
	        return "Prediction file not uploaded.", load_leaderboard()

	    try:
	        ground_truth_path = hf_hub_download(
	            repo_id="SondosMB/ground-truth-dataset",
	            filename="ground_truth.csv",
	            repo_type="dataset",
	            # `use_auth_token` is deprecated; pass the token validated at import.
	            token=HF_TOKEN,
	        )
	        ground_truth_df = pd.read_csv(ground_truth_path)
	    except FileNotFoundError:
	        return "Ground truth file not found in the dataset repository.", load_leaderboard()
	    except Exception as e:
	        return f"Error loading ground truth: {e}", load_leaderboard()

	    try:
	        # Load the prediction file
	        predictions_df = pd.read_csv(prediction_file.name)

	        # Validate required columns in prediction file
	        required_columns = ['question_id', 'predicted_answer']
	        missing_columns = [col for col in required_columns if col not in predictions_df.columns]
	        if missing_columns:
	            return (f"Error: Missing required columns in prediction file: {', '.join(missing_columns)}.",
	                    load_leaderboard())

	        # Validate the ground truth columns the scoring below relies on
	        if 'Answer' not in ground_truth_df.columns:
	            return "Error: 'Answer' column is missing in the ground truth dataset.", load_leaderboard()
	        if 'question_id' not in ground_truth_df.columns:
	            return "Error: 'question_id' column is missing in the ground truth dataset.", load_leaderboard()

	        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
	        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)

	        # "Valid" predictions are those that reduced to an option letter.
	        valid_predictions = merged_df.dropna(subset=['pred_answer'])
	        correct_predictions = int((valid_predictions['pred_answer'] == valid_predictions['Answer']).sum())
	        total_predictions = len(merged_df)
	        total_valid_predictions = len(valid_predictions)

	        overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
	        valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0

	        # Treat a whitespace-only name the same as a missing one.
	        display_name = str(model_name).strip() if model_name else ""

	        results = {
	            'model_name': display_name or "Unknown Model",
	            'overall_accuracy': overall_accuracy,
	            'valid_accuracy': valid_accuracy,
	            'correct_predictions': correct_predictions,
	            'total_questions': total_predictions,
	        }

	        if add_to_leaderboard:
	            update_leaderboard(results)
	            return "Evaluation completed and added to leaderboard.", load_leaderboard()
	        return "Evaluation completed but not added to leaderboard.", load_leaderboard()

	    except Exception as e:
	        return f"Error during evaluation: {str(e)}", load_leaderboard()

	# Make sure the leaderboard CSV exists (with headers) before the UI below reads it.
	initialize_leaderboard_file()

	# NOTE(review): gradio is already imported at the top of the file; this
	# second import is redundant.
	import gradio as gr

	# Custom CSS for the whole page layout: header, pre/post-tab sections,
	# tab container, footer and social links.
	css_tech_theme = """
	body {
	    font-family: 'Roboto', sans-serif;
	    background-color: #f4f6fa;
	    color: #333333;
	    margin: 0;
	    padding: 0;
	}

	/* Header Styling */
	header {
	    text-align: center;
	    padding: 60px 20px;
	    background: linear-gradient(135deg, #6a1b9a, #64b5f6);
	    color: #ffffff;
	    border-radius: 12px;
	    margin-bottom: 30px;
	    box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2);
	}

	header h1 {
	    font-size: 3.5em;
	    font-weight: bold;
	    margin-bottom: 10px;
	}

	header h2 {
	    font-size: 2em;
	    margin-bottom: 15px;
	}

	header p {
	    font-size: 1em;
	    line-height: 1.8;
	}

	.header-buttons {
	    display: flex;
	    justify-content: center;
	    gap: 15px;
	    margin-top: 20px;
	}

	.header-buttons a {
	    text-decoration: none;
	    font-size: 1.5em;
	    padding: 15px 30px;
	    border-radius: 30px;
	    font-weight: bold;
	    background: #ffffff;
	    color: #6a1b9a;
	    transition: transform 0.3s, background 0.3s;
	    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
	}

	.header-buttons a:hover {
	    background: #64b5f6;
	    color: #ffffff;
	    transform: scale(1.05);
	}

	/* Pre-Tabs Section */

	.pre-tabs {
	    text-align: center;
	    padding: 40px 20px;
	    background: linear-gradient(135deg, #ffffff, #f9fafb);
	    border-top: 5px solid #64b5f6;
	    border-bottom: 5px solid #6a1b9a;
	}

	.pre-tabs h2, .post-tabs h2 {
	    font-size: 3em; /* Increase the size for better visibility */
	}

	.pre-tabs p, .post-tabs p {
	    font-size: 2.5em; /* Adjust paragraph text size */
	}

	.pre-tabs h2 {
	    color: #333333;
	    margin-bottom: 15px;
	}

	.pre-tabs p {
	    color: #555555;
	    line-height: 1.8;
	}

	/* Tabs Section */
	.tabs {
	    margin: 0 auto;
	    padding: 20px;
	    background: #ffffff;
	    border-radius: 12px;
	    box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
	    /* max-width: 1300px; (change 1) */
	}

	/* Post-Tabs Section */
	.post-tabs {
	    text-align: center;
	    padding: 40px 20px;
	    background: linear-gradient(135deg, #64b5f6, #6a1b9a);
	    color: #ffffff;
	    border-radius: 12px;
	    margin-top: 30px;
	}

	.post-tabs h2 {
	    font-size: 3.4em;
	    margin-bottom: 15px;
	}

	.post-tabs p {
	    font-size: 2em;
	    line-height: 1.8;
	    margin-bottom: 20px;
	}

	.post-tabs a {
	    text-decoration: none;
	    font-size: 1.1em;
	    padding: 15px 30px;
	    border-radius: 30px;
	    font-weight: bold;
	    background: #ffffff;
	    color: #6a1b9a;
	    transition: transform 0.3s, background 0.3s;
	    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
	}

	.post-tabs a:hover {
	    background: #6a1b9a;
	    color: #ffffff;
	    transform: scale(1.05);
	}

	/* Footer */
	footer {
	    background: linear-gradient(135deg, #6a1b9a, #8e44ad);
	    color: #ffffff;
	    text-align: center;
	    padding: 40px 20px;
	    margin-top: 30px;
	    border-radius: 12px;
	    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.2);
	}

	footer h2 {
	    font-size: 1.5em;
	    margin-bottom: 15px;
	}

	footer p {
	    font-size: 0.8em;
	    line-height: 1.6;
	    margin-bottom: 20px;
	}

	/* Link Styling */
	.social-links {
	    display: flex;
	    justify-content: center;
	    gap: 15px; /* Space between links */
	}

	.social-link {
	    display: inline-block;
	    text-decoration: none;
	    color: #ffffff;
	    background-color: #6a1b9a; /* Purple button background */
	    padding: 10px 20px;
	    border-radius: 30px;
	    font-size: 16px;
	    font-weight: bold;
	    transition: all 0.3s ease;
	    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
	}

	.social-link:hover {
	    background-color: #8c52d3; /* Lighter shade on hover */
	    box-shadow: 0 6px 15px rgba(0, 0, 0, 0.2);
	    transform: translateY(-2px);
	}

	.social-link:active {
	    transform: translateY(1px);
	    box-shadow: 0 3px 8px rgba(0, 0, 0, 0.1);
	}
	"""

	# Create the Gradio Interface

	# ---- Callbacks -------------------------------------------------------------

	def _preview_overall_accuracy(predictions_df):
	    """
	    Return the overall accuracy (in percent, two decimals) of ``predictions_df``.

	    Mirrors the matching rule of ``evaluate_predictions`` (inner merge on
	    'question_id', answers normalised with ``clean_answer``). That function
	    returns only a status message and the leaderboard, so the score of a
	    not-yet-submitted file has to be computed here.
	    """
	    truth_path = hf_hub_download(
	        repo_id="SondosMB/ground-truth-dataset",
	        filename="ground_truth.csv",
	        repo_type="dataset",
	        token=HF_TOKEN,
	    )
	    truth_df = pd.read_csv(truth_path)
	    merged = pd.merge(predictions_df, truth_df, on='question_id', how='inner')
	    if merged.empty:
	        return 0
	    cleaned = merged['predicted_answer'].apply(clean_answer)
	    # Unparseable answers are None and never equal a ground-truth letter.
	    correct = int((cleaned == merged['Answer']).sum())
	    return round(correct / len(merged) * 100, 2)


	def handle_evaluation(file, model_name):
	    """
	    Score the uploaded file without touching the leaderboard.

	    Returns ``(status, overall_accuracy, submit_button_update)``. The submit
	    button is revealed only when the evaluation actually succeeded.
	    """
	    # Check if required inputs are provided
	    if not file:
	        return "Error: Please upload a prediction file.", 0, gr.update(visible=False)
	    if not model_name or model_name.strip() == "":
	        return "Error: Please enter a model name.", 0, gr.update(visible=False)

	    try:
	        # Load predictions file
	        predictions_df = pd.read_csv(file.name)

	        # Validate required columns in the prediction file
	        required_columns = ['question_id', 'predicted_answer']
	        missing_columns = [col for col in required_columns if col not in predictions_df.columns]
	        if missing_columns:
	            return (f"Error: Missing required columns in prediction file: {', '.join(missing_columns)}.",
	                    0, gr.update(visible=False))

	        # Perform evaluation (validation + status message only)
	        status, _ = evaluate_predictions(file, model_name, add_to_leaderboard=False)
	        if not status.startswith("Evaluation completed"):
	            # evaluate_predictions reports failures through its status text.
	            return status, 0, gr.update(visible=False)

	        # Show this file's own score, not the last row of the leaderboard.
	        overall_accuracy = _preview_overall_accuracy(predictions_df)

	        # Show the submit button after successful evaluation
	        return status, overall_accuracy, gr.update(visible=True)

	    except Exception as e:
	        # Handle unexpected errors
	        return f"Error during evaluation: {str(e)}", 0, gr.update(visible=False)


	def handle_submission(file, model_name):
	    """
	    Score the uploaded file and add it to the leaderboard.

	    Returns ``(status, leaderboard_dataframe)`` so the leaderboard table can
	    be refreshed in the same click.
	    """
	    status, leaderboard = evaluate_predictions(file, model_name, add_to_leaderboard=True)
	    if status.startswith("Evaluation completed"):
	        status = f"Submission to leaderboard completed: {status}"
	    # Error statuses are passed through untouched instead of being labelled "completed".
	    return status, leaderboard


	# ---- Layout ----------------------------------------------------------------

	with gr.Blocks(css=css_tech_theme) as demo:
	    # Header Section
	    gr.Markdown("""
	    <header>
	        <h1>🏆 Mobile-MMLU Challenge</h1>
	        <h2>🚀 Pushing the Limits of Mobile LLMs</h2>
	    </header>
	    """)
	    # Pre-Tabs Section
	    gr.Markdown("""
	    <section class="pre-tabs">
	        <h2>Why Participate?</h2>
	        <p>
	        The Mobile-MMLU Benchmark Competition offers a unique opportunity to evaluate your LLMs in real-world mobile scenarios. Join the challenge to drive innovation, showcase your expertise, and shape the future of mobile AI.
	        </p>
	    </section>
	    """)

	    # Tabs Section
	    with gr.Tabs(elem_id="tabs"):
	        # Overview Tab
	        with gr.TabItem("📖 Overview"):
	            gr.Markdown("""
	            <div class="tabs">
	                <h2>About the Competition</h2>
	                <p>The <strong>Mobile-MMLU Benchmark Competition</strong> is a premier challenge designed to evaluate and advance mobile-optimized Large Language Models (LLMs). It provides an unparalleled opportunity to showcase your model's ability to handle diverse, real-world scenarios while pushing the boundaries of mobile intelligence.</p>
	                <p>With a dataset spanning <strong>80 distinct fields</strong> and featuring <strong>16,186 questions</strong>, this competition emphasizes practical application. From education and healthcare to technology and daily life, the questions are crafted to mimic real-world challenges and test the adaptability, accuracy, and efficiency of mobile-compatible LLMs.</p>
	                <h3>Why Compete?</h3>
	                <p>Participating in this competition allows you to:</p>
	                <ul>
	                    <li>🌟 Showcase your expertise in LLM development and optimization for mobile platforms.</li>
	                    <li>🚀 Benchmark your model’s performance against others in a highly competitive environment.</li>
	                    <li>📈 Contribute to advancements in AI for mobile technology, shaping the future of user-centric AI systems.</li>
	                </ul>
	                <h3>How It Works</h3>
	                <ul>
	                    <li>1️⃣ <strong>Download the Dataset:</strong> Access the dataset and instructions on our
	                        <a href="https://github.com/VILA-Lab/Mobile-MMLU" target="_blank">GitHub page</a>.</li>
	                    <li>2️⃣ <strong>Generate Predictions:</strong> Use your LLM to answer the dataset questions.
	                        Format your predictions as a CSV file.</li>
	                    <li>3️⃣ <strong>Submit Predictions:</strong> Upload your predictions on this platform.</li>
	                    <li>4️⃣ <strong>Evaluation:</strong> Submissions are scored based on accuracy.</li>
	                    <li>5️⃣ <strong>Leaderboard:</strong> View real-time rankings on the leaderboard.</li>
	                </ul>
	            </div>
	            """)

	        # Submission Tab
	        with gr.TabItem("📤 Submission"):
	            # Markdown is a component, not a layout container, so it is
	            # created with a plain call; the rows below belong to the tab.
	            gr.Markdown("""
	            <div class="submission-section">
	                <h2>Submit Your Predictions</h2>
	                <p>Upload your prediction file and provide your model name to evaluate and submit to the leaderboard.</p>
	            </div>
	            """)
	            with gr.Row(elem_id="submission-fields"):
	                file_input = gr.File(label="Upload Prediction CSV", file_types=[".csv"], interactive=True)
	                model_name_input = gr.Textbox(label="Model Name", placeholder="Enter your model name")

	            with gr.Row(elem_id="submission-results"):
	                overall_accuracy_display = gr.Number(label="Overall Accuracy", interactive=False)

	            with gr.Row(elem_id="submission-buttons"):
	                eval_button = gr.Button("Evaluate")
	                submit_button = gr.Button("Prove and Submit to Leaderboard", visible=False)
	                eval_status = gr.Textbox(label="Evaluation Status", interactive=False)

	        # Leaderboard Tab
	        with gr.TabItem("🏅 Leaderboard"):
	            leaderboard_table = gr.Dataframe(
	                value=load_leaderboard(),
	                label="Leaderboard",
	                interactive=False,
	                wrap=True,
	            )
	            refresh_button = gr.Button("Refresh Leaderboard")

	    # Post-Tabs Section
	    gr.Markdown("""
	    <section class="post-tabs">
	        <h2>Ready to Compete?</h2>
	        <h3>
	        Submit your predictions today and make your mark in advancing mobile AI technologies.
	        Show the world what your model can achieve!
	        </h3>
	    </section>
	    """)

	    # Footer Section
	    gr.Markdown("""
	    <footer>
	        <h2>Stay Connected</h2>
	        <p>
	        Follow us on social media or contact us for any queries. Let's shape the future of AI together!
	        </p>
	        <div class="social-links">
	            <a href="https://website.com" target="_blank" class="social-link">🌐 Website</a>
	            <a href="https://github.com/VILA-Lab/Mobile-MMLU" target="_blank" class="social-link">🐙 GitHub</a>
	        </div>
	    </footer>
	    """)

	    # Connect button clicks to the functions. Wired last so every component
	    # they reference (including the leaderboard table) already exists.
	    eval_button.click(
	        handle_evaluation,
	        inputs=[file_input, model_name_input],
	        outputs=[eval_status, overall_accuracy_display, submit_button],
	    )

	    submit_button.click(
	        handle_submission,
	        inputs=[file_input, model_name_input],
	        outputs=[eval_status, leaderboard_table],
	    )

	    refresh_button.click(
	        load_leaderboard,
	        inputs=[],
	        outputs=[leaderboard_table],
	    )

	demo.launch()