Spaces:

MBZUAI-LLM
/

Mobile-MMLU-Challenge

Sleeping

App Files Files Community

Mobile-MMLU-Challenge / app.py

SondosMB

Update app.py

28c6e92 verified 11 months ago

raw

history blame

19.3 kB

	import gradio as gr
	import pandas as pd
	import os
	import re
	from datetime import datetime
	from huggingface_hub import hf_hub_download
	from huggingface_hub import HfApi, HfFolder

	# Local CSV that stores the leaderboard rows, and the ground-truth file name.
	LEADERBOARD_FILE = "leaderboard.csv"
	GROUND_TRUTH_FILE = "ground_truth.csv"
	# Human-readable date, captured once when the module is imported.
	LAST_UPDATED = datetime.now().strftime("%B %d, %Y")

	# Ask huggingface_hub not to warn about symlinks.
	# NOTE(review): huggingface_hub is imported before this assignment; confirm
	# the library still honours the variable when it is set after import.
	os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

	# Fail fast at start-up when no Hub token has been configured.
	HF_TOKEN = os.environ.get("HF_TOKEN")
	if HF_TOKEN is None or HF_TOKEN == "":
	    raise ValueError("HF_TOKEN environment variable is not set or invalid.")

	def initialize_leaderboard_file():
	    """
	    Create the leaderboard CSV with only its header row when needed.

	    The file is (re)written when it does not exist or when it exists but
	    has a size of zero bytes; a file with content is left untouched.
	    """
	    is_missing = not os.path.exists(LEADERBOARD_FILE)
	    if is_missing or os.stat(LEADERBOARD_FILE).st_size == 0:
	        header_only = pd.DataFrame(columns=[
	            "Model Name", "Overall Accuracy", "Valid Accuracy",
	            "Correct Predictions", "Total Questions", "Timestamp"
	        ])
	        header_only.to_csv(LEADERBOARD_FILE, index=False)

	def clean_answer(answer):
	    """
	    Normalise a raw predicted answer to a single upper-case choice letter.

	    A letter A-D (any case) that stands on its own, such as "B", "(c)" or
	    "Answer: D", is preferred. Only when no stand-alone letter exists does
	    the function fall back to the first A-D character found anywhere in the
	    text, so letters buried in ordinary words ("Answer", "correct") no
	    longer win over the actual choice.

	    Args:
	        answer: Raw cell value from the prediction file.

	    Returns:
	        'A', 'B', 'C' or 'D', or None when the value is missing or contains
	        no A-D character at all.
	    """
	    if pd.isna(answer):
	        return None
	    text = str(answer)

	    # A choice letter that is not glued to other letters, digits or '_'.
	    standalone = re.search(r'(?<![A-Za-z0-9_])[A-Da-d](?![A-Za-z0-9_])', text)
	    if standalone:
	        return standalone.group(0).upper()

	    # Fallback: first A-D character anywhere in the text.
	    letters = re.sub(r'[^A-Da-d]', '', text)
	    return letters[0].upper() if letters else None


	def update_leaderboard(results):
	    """
	    Append one submission to the leaderboard CSV and push the file to the Hub.

	    Args:
	        results: Mapping with the keys 'model_name', 'overall_accuracy',
	            'valid_accuracy', 'correct_predictions' and 'total_questions'.
	            The two accuracy values are multiplied by 100 and rounded to
	            two decimals before being stored.

	    Failures of the local write or of the upload are printed rather than
	    raised, so they never propagate to the caller.
	    """
	    new_entry = {
	        "Model Name": results['model_name'],
	        "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
	        "Valid Accuracy": round(results['valid_accuracy'] * 100, 2),
	        "Correct Predictions": results['correct_predictions'],
	        "Total Questions": results['total_questions'],
	        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	    }

	    try:
	        # A header row is needed when the file is missing *or* empty;
	        # checking existence alone would append data rows without a header
	        # to a zero-byte file.
	        needs_header = (
	            not os.path.exists(LEADERBOARD_FILE)
	            or os.stat(LEADERBOARD_FILE).st_size == 0
	        )
	        pd.DataFrame([new_entry]).to_csv(
	            LEADERBOARD_FILE,
	            mode='a',  # Append mode
	            index=False,
	            header=needs_header,
	        )
	        print(f"Leaderboard updated successfully at {LEADERBOARD_FILE}")
	    except Exception as e:
	        print(f"Error updating leaderboard file: {e}")
	        return

	    try:
	        # Use the token validated at module start-up instead of looking one
	        # up again through HfFolder.
	        HfApi().upload_file(
	            path_or_fileobj=LEADERBOARD_FILE,
	            path_in_repo="leaderboard.csv",
	            repo_id="SondosMB/ss",  # Your Space repository
	            repo_type="space",
	            token=HF_TOKEN,
	        )
	        print("Leaderboard changes pushed to Hugging Face repository.")
	    except Exception as e:
	        # Reported separately: the local file was written, only the push failed.
	        print(f"Error pushing leaderboard to Hugging Face repository: {e}")



	def load_leaderboard():
	    """
	    Return the leaderboard as a DataFrame.

	    When the CSV is missing or has a size of zero bytes, an empty frame
	    with the six leaderboard columns is returned instead of reading the file.
	    """
	    has_content = (
	        os.path.exists(LEADERBOARD_FILE)
	        and os.stat(LEADERBOARD_FILE).st_size != 0
	    )
	    if has_content:
	        return pd.read_csv(LEADERBOARD_FILE)

	    column_names = (
	        "Model Name",
	        "Overall Accuracy",
	        "Valid Accuracy",
	        "Correct Predictions",
	        "Total Questions",
	        "Timestamp",
	    )
	    return pd.DataFrame({name: [] for name in column_names})

	def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
	    """
	    Score an uploaded prediction CSV against the ground truth from the Hub.

	    Args:
	        prediction_file: Uploaded file object; its ``.name`` attribute is
	            read as the path of the CSV. Must provide the columns
	            'question_id' and 'predicted_answer'.
	        model_name: Name recorded with the results; "Unknown Model" is used
	            when it is empty.
	        add_to_leaderboard: When truthy, the results are recorded through
	            update_leaderboard().

	    Returns:
	        A ``(status_message, leaderboard_dataframe)`` tuple. Problems are
	        reported through the status message rather than raised.
	    """
	    # Cheap local check first: no need to hit the network without an upload.
	    if not prediction_file:
	        return "Prediction file not uploaded.", load_leaderboard()

	    try:
	        ground_truth_path = hf_hub_download(
	            repo_id="SondosMB/ground-truth-dataset",
	            filename="ground_truth.csv",
	            repo_type="dataset",
	            # Pass the token validated at start-up explicitly instead of
	            # relying on the legacy `use_auth_token=True` lookup.
	            token=HF_TOKEN,
	        )
	        ground_truth_df = pd.read_csv(ground_truth_path)
	    except FileNotFoundError:
	        return "Ground truth file not found in the dataset repository.", load_leaderboard()
	    except Exception as e:
	        return f"Error loading ground truth: {e}", load_leaderboard()

	    try:
	        # Load prediction file
	        predictions_df = pd.read_csv(prediction_file.name)

	        # Validate required columns in prediction file
	        required_columns = ['question_id', 'predicted_answer']
	        missing_columns = [col for col in required_columns if col not in predictions_df.columns]
	        if missing_columns:
	            return (f"Error: Missing required columns in prediction file: {', '.join(missing_columns)}.",
	                    load_leaderboard())

	        # Validate the columns needed from the ground truth file
	        if 'Answer' not in ground_truth_df.columns:
	            return "Error: 'Answer' column is missing in the ground truth dataset.", load_leaderboard()
	        if 'question_id' not in ground_truth_df.columns:
	            return "Error: 'question_id' column is missing in the ground truth dataset.", load_leaderboard()

	        # Merge only the columns that are scored. Extra columns in the upload
	        # that share a name with a ground-truth column (e.g. 'Answer') would
	        # otherwise be suffixed by pandas and break the lookups below.
	        # NOTE(review): the inner join keeps only questions present in the
	        # upload, so unanswered questions do not lower "Overall Accuracy" and
	        # repeated question_ids are counted more than once - confirm whether
	        # partial or duplicated submissions should be penalised.
	        merged_df = pd.merge(
	            predictions_df[required_columns],
	            ground_truth_df[['question_id', 'Answer']],
	            on='question_id',
	            how='inner',
	        )
	        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)

	        # "Valid" predictions are those clean_answer() could map to a letter.
	        valid_predictions = merged_df.dropna(subset=['pred_answer'])
	        correct_predictions = int(
	            (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
	        )
	        total_predictions = len(merged_df)
	        total_valid_predictions = len(valid_predictions)

	        overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
	        valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0

	        results = {
	            'model_name': model_name if model_name else "Unknown Model",
	            'overall_accuracy': overall_accuracy,
	            'valid_accuracy': valid_accuracy,
	            'correct_predictions': correct_predictions,
	            'total_questions': total_predictions,
	        }

	        if add_to_leaderboard:
	            update_leaderboard(results)
	            return "Evaluation completed and added to leaderboard.", load_leaderboard()
	        return "Evaluation completed but not added to leaderboard.", load_leaderboard()

	    except Exception as e:
	        return f"Error during evaluation: {str(e)}", load_leaderboard()

	# Make sure the leaderboard CSV exists with its header row before the UI below is built.
	initialize_leaderboard_file()

	# NOTE(review): gradio is already imported as `gr` at the top of the file;
	# this repeated import is redundant.
	import gradio as gr

	# Stylesheet passed to gr.Blocks(css=...): page header, tabs container,
	# post-tabs call-to-action, footer links and the submission buttons.
	# Every /* ... */ comment must be closed; an unterminated one swallows the
	# rules that follow it.

	css_tech_theme = """
	body {
	font-family: 'Roboto', sans-serif;
	background-color: #f4f6fa;
	color: #333333;
	margin: 0;
	padding: 0;
	}

	/* Header Styling */
	header {
	text-align: center;
	padding: 60px 20px;
	background: linear-gradient(135deg, #6a1b9a, #64b5f6);
	color: #ffffff;
	border-radius: 12px;
	margin-bottom: 30px;
	box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2);
	}

	header h1 {
	font-size: 3.5em;
	font-weight: bold;
	margin-bottom: 10px;
	}

	header h2 {
	font-size: 2em;
	margin-bottom: 15px;
	}

	header p {
	font-size: 1em;
	line-height: 1.8;
	}

	.header-buttons {
	display: flex;
	justify-content: center;
	gap: 15px;
	margin-top: 20px;
	}

	.header-buttons a {
	text-decoration: none;
	font-size: 1.5em;
	padding: 15px 30px;
	border-radius: 30px;
	font-weight: bold;
	background: #ffffff;
	color: #6a1b9a;
	transition: transform 0.3s, background 0.3s;
	box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
	}

	.header-buttons a:hover {
	background: #64b5f6;
	color: #ffffff;
	transform: scale(1.05);
	}







	/* Tabs Section */
	.tabs {
	margin: 0 auto;
	padding: 20px;
	background: #ffffff;
	border-radius: 12px;
	box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
	/* max-width: 1300px; (change 1) */
	}

	/* Post-Tabs Section */
	.post-tabs {
	text-align: center;
	padding: 40px 20px;
	background: linear-gradient(135deg, #64b5f6, #6a1b9a);
	color: #ffffff;
	border-radius: 12px;
	margin-top: 30px;
	}

	.post-tabs h2 {
	color: blue;
	font-size: 3.4em;
	margin-bottom: 15px;
	}

	.post-tabs p {
	font-size: 2em;
	line-height: 1.8;
	margin-bottom: 20px;
	}

	.post-tabs a {
	text-decoration: none;
	font-size: 1.1em;
	padding: 15px 30px;
	border-radius: 30px;
	font-weight: bold;
	background: #ffffff;
	color: #6a1b9a;
	transition: transform 0.3s, background 0.3s;
	box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
	}

	.post-tabs a:hover {
	background: #6a1b9a;
	color: #ffffff;
	transform: scale(1.05);
	}

	/* Footer */
	/*footer {
	background: linear-gradient(135deg, #6a1b9a, #8e44ad);
	color: #ffffff;
	text-align: center;
	padding: 40px 20px;
	margin-top: 30px;
	border-radius: 12px;
	box-shadow: 0 4px 10px rgba(0, 0, 0, 0.2);
	}*/

	footer h2 {
	font-size: 1.5em;
	margin-bottom: 15px;
	}

	footer p {
	font-size: 0.8em;
	line-height: 1.6;
	margin-bottom: 20px;
	}
	/* Link Styling */
	.social-links {
	display: flex;
	justify-content: center;
	gap: 15px; /* Space between links */
	}

	.social-link {
	display: inline-block;
	text-decoration: none;
	color: #ffffff;
	background-color: #6a1b9a; /* Purple button background */
	padding: 10px 20px;
	border-radius: 30px;
	font-size: 16px;
	font-weight: bold;
	transition: all 0.3s ease;
	box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
	}

	.social-link:hover {
	background-color: #8c52d3; /* Darker shade on hover */
	box-shadow: 0 6px 15px rgba(0, 0, 0, 0.2);
	transform: translateY(-2px);
	}

	.social-link:active {
	transform: translateY(1px);
	box-shadow: 0 3px 8px rgba(0, 0, 0, 0.1);
	}


	#submission-buttons {
	display: flex;
	justify-content: center;
	gap: 15px;
	margin-top: 20px;
	}



	/* Buttons Styling */
	#submission-buttons button {
	padding: 12px 25px;
	font-size: 1.1em;
	color: #ffffff;
	background: #6a1b9a;
	border: none;
	border-radius: 30px;
	cursor: pointer;
	font-weight: bold;
	transition: all 0.3s ease;
	box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
	}
	#submission-buttons button:hover {
	background: #8c52d3; /* Slightly lighter purple */
	transform: scale(1.05);
	box-shadow: 0 6px 15px rgba(0, 0, 0, 0.2);
	}
	#submission-buttons button:active {
	background: #5e1287; /* Darker purple */
	transform: scale(0.98);
	box-shadow: 0 3px 10px rgba(0, 0, 0, 0.1);
	}


	"""

	# Build the Gradio interface: header, intro, three tabs (overview,
	# submission, leaderboard), call-to-action and footer.
	# NOTE(review): the nesting below was reconstructed from the statements
	# themselves; confirm which components belong to which Row.

	with gr.Blocks(css=css_tech_theme) as demo:
	    # Header Section
	    gr.Markdown("""
	<header>
	<h1>🏆 Mobile-MMLU Challenge</h1>
	<h2>🚀 Pushing the Limits of Mobile LLMs</h2>
	</header>
	""")
	    # Pre-Tabs Section
	    gr.Markdown("""
	<section class="pre-tabs" style="padding: 40px 20px; background: linear-gradient(135deg, #ffffff, #f9fafb); border: 3px solid #6a1b9a; border-radius: 12px; box-shadow: 0 4px 10px rgba(106, 27, 154, 0.2); margin-bottom: 20px;">
	<h2 style="color: #6a1b9a; text-align: center; font-size: 2.5em;">Why Participate?</h2>
	<p style="font-size: 1.2em; color: #333; text-align: center;">
	The Mobile-MMLU Benchmark Competition offers a unique opportunity to evaluate your LLMs in real-world mobile scenarios.
	Join the challenge to drive innovation, showcase your expertise, and shape the future of mobile AI.</p>
	</section>""")

	    # Tabs Section
	    with gr.Tabs(elem_id="tabs"):
	        # Overview Tab
	        with gr.TabItem("📖 Overview"):
	            gr.Markdown("""
	<div class="tabs">
	<h2 style="color: #6a1b9a; text-align: center;">About the Competition</h2>
	<p>The <strong>Mobile-MMLU Benchmark Competition</strong> is a premier challenge designed to evaluate and advance mobile-optimized Large Language Models (LLMs). This competition is an excellent opportunity to showcase your model's ability to handle real-world scenarios and excel in mobile intelligence.</p>
	<p>With a dataset spanning <strong>80 distinct fields</strong> and featuring <strong>16,186 questions</strong>, the competition emphasizes practical applications, from education and healthcare to technology and daily life.</p>
	<h3 style="color: #8e44ad;">Why Compete?</h3>
	<p>Participating in this competition allows you to:</p>
	<ul>
	<li>🌟 Showcase your expertise in developing and optimizing LLMs for mobile platforms.</li>
	<li>🚀 Benchmark your model’s performance against others in a highly competitive environment.</li>
	<li>📈 Contribute to advancements in mobile AI, shaping the future of user-centric AI systems.</li>
	</ul>
	<h3 style="color: #6a1b9a;">How It Works</h3>
	<ol>
	<li>1️⃣ <strong>Download the Dataset:</strong> Access the dataset and detailed instructions on the <a href="https://github.com/your-github-repo" target="_blank">GitHub page</a>. Follow the steps to ensure your environment is set up correctly.</li>
	<li>2️⃣ <strong>Generate Predictions:</strong> Use the provided script in the GitHub repository to generate answers. Ensure the output file matches the format in the github </li>
	<li>3️⃣ <strong>Submit Predictions:</strong> Upload your CSV file to the <strong>Submission Page</strong> on this platform.</li>
	<li>4️⃣ <strong>Evaluation:</strong> Your submission will be scored based on accuracy. The results will include overall and valid accuracy metrics.</li>
	<li>5️⃣ <strong>Leaderboard:</strong> Optionally, add your results to the real-time leaderboard to compare your model's performance with others.</li>
	</ol>
	<h3 style="color: #8e44ad;">Resources</h3>
	<ul>
	<li>📂 <a href="https://github.com/your-github-repo" target="_blank">GitHub Repository</a>: Contains the dataset, scripts, and detailed instructions.</li>
	<li>📊 <a href="https://github.com/your-dataset-link" target="_blank">Dataset Link</a>: Direct access to the competition dataset.</li>
	<li>❓ <a href="https://github.com/your-github-repo/issues" target="_blank">Support Page</a>: Use this for queries or issues during participation.</li>
	</ul>
	</div>
	""")

	        # Submission Tab
	        with gr.TabItem("📤 Submission"):
	            gr.Markdown("""
	<div class="submission-section" style="border: 3px solid #6a1b9a; padding: 20px; border-radius: 12px; box-shadow: 0 4px 10px rgba(106, 27, 154, 0.2);">
	<h2 style="color: #6a1b9a; text-align: center;">Submit Your Predictions</h2>
	<p style="font-size: 1.2em; color: #333; text-align: center;">Upload your prediction file and provide your model name to evaluate and optionally submit your results to the leaderboard.</p>
	</div>
	""")
	            with gr.Row(elem_id="submission-fields"):
	                file_input = gr.File(label="📂 Upload Prediction CSV", file_types=[".csv"], interactive=True, scale=1, min_width=12000)
	                model_name_input = gr.Textbox(label="🏷️ Model Name", placeholder="Enter your model name", scale=1, min_width=800)

	            with gr.Row(elem_id="submission-results"):
	                overall_accuracy_display = gr.Number(label="📊 Overall Accuracy (%)", interactive=False, scale=1, min_width=1200)

	            with gr.Row(elem_id="submission-buttons"):
	                eval_button = gr.Button("📈 Evaluate", scale=1, min_width=1200)
	                # Hidden until an evaluation has succeeded.
	                submit_button = gr.Button("📤 Prove and Submit to Leaderboard", elem_id="evaluation-status", visible=False, scale=1, min_width=1200)
	                eval_status = gr.Textbox(label="🛠️ Evaluation Status", interactive=False, scale=1, min_width=1200)

	            # Event handlers; defined here so they can be wired to the
	            # components created above.
	            def score_overall_accuracy(predictions_df):
	                """
	                Return the overall accuracy, in percent, of ``predictions_df``.

	                Rows are matched to the ground truth on 'question_id'; a row
	                counts as correct when clean_answer() of its
	                'predicted_answer' equals the ground-truth 'Answer'. The
	                value is rounded to two decimals, and 0 is returned when no
	                row matches.
	                """
	                ground_truth_df = pd.read_csv(hf_hub_download(
	                    repo_id="SondosMB/ground-truth-dataset",
	                    filename="ground_truth.csv",
	                    repo_type="dataset",
	                    token=HF_TOKEN,
	                ))
	                merged = pd.merge(
	                    predictions_df[['question_id', 'predicted_answer']],
	                    ground_truth_df[['question_id', 'Answer']],
	                    on='question_id',
	                    how='inner',
	                )
	                if merged.empty:
	                    return 0
	                predicted = merged['predicted_answer'].apply(clean_answer)
	                correct = int((predicted.notna() & (predicted == merged['Answer'])).sum())
	                return round(correct / len(merged) * 100, 2)

	            def handle_evaluation(file, model_name):
	                """
	                Evaluate the upload without recording it on the leaderboard.

	                Returns the status text, the overall accuracy (percent) of
	                this upload, and a visibility update for the submit button,
	                which is revealed only when the evaluation succeeded.
	                """
	                hidden = gr.update(visible=False)

	                # Check if required inputs are provided
	                if not file:
	                    return "Error: Please upload a prediction file.", 0, hidden
	                if not model_name or model_name.strip() == "":
	                    return "Error: Please enter a model name.", 0, hidden

	                try:
	                    # Load predictions file
	                    predictions_df = pd.read_csv(file.name)

	                    # Validate required columns in the prediction file
	                    required_columns = ['question_id', 'predicted_answer']
	                    missing_columns = [col for col in required_columns if col not in predictions_df.columns]
	                    if missing_columns:
	                        return (f"Error: Missing required columns in prediction file: {', '.join(missing_columns)}.",
	                                0, hidden)

	                    # Perform evaluation
	                    status, _ = evaluate_predictions(file, model_name, add_to_leaderboard=False)
	                    if not status.startswith("Evaluation completed"):
	                        # evaluate_predictions reports problems through the
	                        # status text; keep the submit button hidden.
	                        return status, 0, hidden

	                    # Score this upload. The leaderboard returned above is
	                    # unchanged when add_to_leaderboard is False, so its last
	                    # row belongs to an earlier submission, not to this one.
	                    overall_accuracy = score_overall_accuracy(predictions_df)

	                    # Show the submit button after successful evaluation
	                    return status, overall_accuracy, gr.update(visible=True)

	                except Exception as e:
	                    # Handle unexpected errors
	                    return f"Error during evaluation: {str(e)}", 0, hidden

	            def handle_submission(file, model_name):
	                """Evaluate the upload again and record it on the leaderboard."""
	                status, _ = evaluate_predictions(file, model_name, add_to_leaderboard=True)
	                return f"Submission to leaderboard completed: {status}"

	            # Connect button clicks to the functions
	            eval_button.click(
	                handle_evaluation,
	                inputs=[file_input, model_name_input],
	                outputs=[eval_status, overall_accuracy_display, submit_button],
	            )

	            submit_button.click(
	                handle_submission,
	                inputs=[file_input, model_name_input],
	                outputs=[eval_status],
	            )

	        # Leaderboard Tab
	        with gr.TabItem("🏅 Leaderboard"):
	            leaderboard_table = gr.Dataframe(
	                value=load_leaderboard(),
	                label="Leaderboard",
	                interactive=False,
	                wrap=True,
	            )
	            refresh_button = gr.Button("Refresh Leaderboard")
	            refresh_button.click(
	                load_leaderboard,
	                inputs=[],
	                outputs=[leaderboard_table],
	            )

	    # Post-Tabs Section
	    gr.Markdown("""
	<section class="post-tabs">
	<h2>Ready to Compete?</h2>
	<h3>
	Submit your predictions today and make your mark in advancing mobile AI technologies.
	Show the world what your model can achieve!
	</h3>
	</section>
	""")

	    # Footer Section
	    gr.Markdown("""
	<footer>
	<h2>Stay Connected</h2>
	<p>
	Follow us on social media or contact us for any queries. Let's shape the future of AI together!
	</p>
	<div class="social-links">
	<a href="https://website.com" target="_blank" class="social-link">🌐 Website</a>
	<a href="https://github.com/VILA-Lab/Mobile-MMLU" target="_blank" class="social-link">🐙 GitHub</a>
	</div>
	</footer>
	""")

	demo.launch()