Spaces:

MBZUAI-LLM
/

Mobile-MMLU-Challenge

Sleeping

App Files Community

SondosMB committed on Dec 20, 2024

Commit

6bcbc7b

verified ·

1 Parent(s): 9f7748a

Update app.py

Browse files

Files changed (1) hide show

app.py +200 -65

app.py CHANGED Viewed

@@ -176,16 +176,205 @@
 #     demo.launch()
 import gradio as gr
 import pandas as pd
 import os
 import re
 from datetime import datetime
-LEADERBOARD_FILE = "leaderboard.csv"  # File to store leaderboard data
 LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
 def clean_answer(answer):
     if pd.isna(answer):
         return None
     answer = str(answer)
@@ -194,49 +383,9 @@ def clean_answer(answer):
         return clean[0].upper()
     return None
-def evaluate_predictions(prediction_file):
-    ground_truth_file = "ground_truth.csv"
-    if not os.path.exists(ground_truth_file):
-        return "Ground truth file not found."
-    if not prediction_file:
-        return "Prediction file not uploaded."
-    try:
-        predictions_df = pd.read_csv(prediction_file.name)
-        ground_truth_df = pd.read_csv(ground_truth_file)
-        model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
-        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
-        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
-        valid_predictions = merged_df.dropna(subset=['pred_answer'])
-        correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
-        total_predictions = len(merged_df)
-        total_valid_predictions = len(valid_predictions)
-        overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
-        valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
-        results = {
-            'model_name': model_name,
-            'overall_accuracy': overall_accuracy,
-            'valid_accuracy': valid_accuracy,
-            'correct_predictions': correct_predictions,
-            'total_questions': total_predictions,
-        }
-        update_leaderboard(results)
-        return "Evaluation completed successfully! Leaderboard updated."
-    except Exception as e:
-        return f"Error during evaluation: {str(e)}"
-# Build Gradio App
 def update_leaderboard(results):
     """
-    Update the leaderboard file with new results.
     """
     new_entry = {
         "Model Name": results['model_name'],
@@ -247,37 +396,18 @@ def update_leaderboard(results):
         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
     }
-    # Convert new entry to DataFrame
     new_entry_df = pd.DataFrame([new_entry])
-    # Append to leaderboard file
-    if not os.path.exists(LEADERBOARD_FILE):
-        # If file does not exist, create it with headers
-        new_entry_df.to_csv(LEADERBOARD_FILE, index=False)
-    else:
-        # Append without headers
-        new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
 def load_leaderboard():
     """
-    Load the leaderboard from the leaderboard file.
     """
-    if not os.path.exists(LEADERBOARD_FILE):
-        return pd.DataFrame({
-            "Model Name": [],
-            "Overall Accuracy": [],
-            "Valid Accuracy": [],
-            "Correct Predictions": [],
-            "Total Questions": [],
-            "Timestamp": [],
-        })
     return pd.read_csv(LEADERBOARD_FILE)
 def evaluate_predictions_and_update_leaderboard(prediction_file):
     """
-    Evaluate predictions and update the leaderboard.
     """
     ground_truth_file = "ground_truth.csv"
     if not os.path.exists(ground_truth_file):
@@ -286,18 +416,22 @@ def evaluate_predictions_and_update_leaderboard(prediction_file):
         return "Prediction file not uploaded.", load_leaderboard()
     try:
         predictions_df = pd.read_csv(prediction_file.name)
         ground_truth_df = pd.read_csv(ground_truth_file)
         model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
         valid_predictions = merged_df.dropna(subset=['pred_answer'])
         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
         total_predictions = len(merged_df)
         total_valid_predictions = len(valid_predictions)
         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
@@ -309,12 +443,13 @@ def evaluate_predictions_and_update_leaderboard(prediction_file):
             'total_questions': total_predictions,
         }
         update_leaderboard(results)
         return "Evaluation completed successfully! Leaderboard updated.", load_leaderboard()
     except Exception as e:
         return f"Error during evaluation: {str(e)}", load_leaderboard()
-# Build Gradio App
 with gr.Blocks() as demo:
     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")

 #     demo.launch()
+# import gradio as gr
+# import pandas as pd
+# import os
+# import re
+# from datetime import datetime
+# LEADERBOARD_FILE = "leaderboard.csv"  # File to store leaderboard data
+# LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
+# def clean_answer(answer):
+#     if pd.isna(answer):
+#         return None
+#     answer = str(answer)
+#     clean = re.sub(r'[^A-Da-d]', '', answer)
+#     if clean:
+#         return clean[0].upper()
+#     return None
+# def evaluate_predictions(prediction_file):
+#     ground_truth_file = "ground_truth.csv"
+#     if not os.path.exists(ground_truth_file):
+#         return "Ground truth file not found."
+#     if not prediction_file:
+#         return "Prediction file not uploaded."
+#     try:
+#         predictions_df = pd.read_csv(prediction_file.name)
+#         ground_truth_df = pd.read_csv(ground_truth_file)
+#         model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
+#         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
+#         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
+#         valid_predictions = merged_df.dropna(subset=['pred_answer'])
+#         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
+#         total_predictions = len(merged_df)
+#         total_valid_predictions = len(valid_predictions)
+#         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+#         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
+#         results = {
+#             'model_name': model_name,
+#             'overall_accuracy': overall_accuracy,
+#             'valid_accuracy': valid_accuracy,
+#             'correct_predictions': correct_predictions,
+#             'total_questions': total_predictions,
+#         }
+#         update_leaderboard(results)
+#         return "Evaluation completed successfully! Leaderboard updated."
+#     except Exception as e:
+#         return f"Error during evaluation: {str(e)}"
+# # Build Gradio App
+# def update_leaderboard(results):
+#     """
+#     Update the leaderboard file with new results.
+#     """
+#     new_entry = {
+#         "Model Name": results['model_name'],
+#         "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
+#         "Valid Accuracy": round(results['valid_accuracy'] * 100, 2),
+#         "Correct Predictions": results['correct_predictions'],
+#         "Total Questions": results['total_questions'],
+#         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+#     }
+#     # Convert new entry to DataFrame
+#     new_entry_df = pd.DataFrame([new_entry])
+#     # Append to leaderboard file
+#     if not os.path.exists(LEADERBOARD_FILE):
+#         # If file does not exist, create it with headers
+#         new_entry_df.to_csv(LEADERBOARD_FILE, index=False)
+#     else:
+#         # Append without headers
+#         new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
+# def load_leaderboard():
+#     """
+#     Load the leaderboard from the leaderboard file.
+#     """
+#     if not os.path.exists(LEADERBOARD_FILE):
+#         return pd.DataFrame({
+#             "Model Name": [],
+#             "Overall Accuracy": [],
+#             "Valid Accuracy": [],
+#             "Correct Predictions": [],
+#             "Total Questions": [],
+#             "Timestamp": [],
+#         })
+#     return pd.read_csv(LEADERBOARD_FILE)
+# def evaluate_predictions_and_update_leaderboard(prediction_file):
+#     """
+#     Evaluate predictions and update the leaderboard.
+#     """
+#     ground_truth_file = "ground_truth.csv"
+#     if not os.path.exists(ground_truth_file):
+#         return "Ground truth file not found.", load_leaderboard()
+#     if not prediction_file:
+#         return "Prediction file not uploaded.", load_leaderboard()
+#     try:
+#         predictions_df = pd.read_csv(prediction_file.name)
+#         ground_truth_df = pd.read_csv(ground_truth_file)
+#         model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
+#         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
+#         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
+#         valid_predictions = merged_df.dropna(subset=['pred_answer'])
+#         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
+#         total_predictions = len(merged_df)
+#         total_valid_predictions = len(valid_predictions)
+#         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+#         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
+#         results = {
+#             'model_name': model_name,
+#             'overall_accuracy': overall_accuracy,
+#             'valid_accuracy': valid_accuracy,
+#             'correct_predictions': correct_predictions,
+#             'total_questions': total_predictions,
+#         }
+#         update_leaderboard(results)
+#         return "Evaluation completed successfully! Leaderboard updated.", load_leaderboard()
+#     except Exception as e:
+#         return f"Error during evaluation: {str(e)}", load_leaderboard()
+# # Build Gradio App
+# with gr.Blocks() as demo:
+#     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
+#     with gr.Tabs():
+#         # Submission Tab
+#         with gr.TabItem("🏅 Submission"):
+#             file_input = gr.File(label="Upload Prediction CSV")
+#             eval_status = gr.Textbox(label="Evaluation Status", interactive=False)
+#             leaderboard_table_preview = gr.Dataframe(
+#                 value=load_leaderboard(),
+#                 label="Leaderboard (Preview)",
+#                 interactive=False,
+#                 wrap=True,
+#             )
+#             eval_button = gr.Button("Evaluate and Update Leaderboard")
+#             eval_button.click(
+#                 evaluate_predictions_and_update_leaderboard,
+#                 inputs=[file_input],
+#                 outputs=[eval_status, leaderboard_table_preview],
+#             )
+#         # Leaderboard Tab
+#         with gr.TabItem("🏅 Leaderboard"):
+#             leaderboard_table = gr.Dataframe(
+#                 value=load_leaderboard(),
+#                 label="Leaderboard",
+#                 interactive=False,
+#                 wrap=True,
+#             )
+#             refresh_button = gr.Button("Refresh Leaderboard")
+#             refresh_button.click(
+#                 lambda: load_leaderboard(),
+#                 inputs=[],
+#                 outputs=[leaderboard_table],
+#             )
+#     gr.Markdown(f"Last updated on **{LAST_UPDATED}**")
+# demo.launch()
 import gradio as gr
 import pandas as pd
 import os
 import re
 from datetime import datetime
+LEADERBOARD_FILE = "leaderboard.csv"  # File to store all submissions persistently
 LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
+# Initialize the leaderboard file if it doesn't exist
+if not os.path.exists(LEADERBOARD_FILE):
+    pd.DataFrame(columns=[
+        "Model Name", "Overall Accuracy", "Valid Accuracy",
+        "Correct Predictions", "Total Questions", "Timestamp"
+    ]).to_csv(LEADERBOARD_FILE, index=False)
 def clean_answer(answer):
+    """
+    Clean and normalize the predicted answers.
+    """
     if pd.isna(answer):
         return None
     answer = str(answer)
         return clean[0].upper()
     return None
 def update_leaderboard(results):
     """
+    Append new submission results to the leaderboard file.
     """
     new_entry = {
         "Model Name": results['model_name'],
         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
     }
     new_entry_df = pd.DataFrame([new_entry])
+    new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
 def load_leaderboard():
     """
+    Load all submissions from the leaderboard file.
     """
     return pd.read_csv(LEADERBOARD_FILE)
 def evaluate_predictions_and_update_leaderboard(prediction_file):
     """
+    Evaluate predictions and append results to the leaderboard.
     """
     ground_truth_file = "ground_truth.csv"
     if not os.path.exists(ground_truth_file):
         return "Prediction file not uploaded.", load_leaderboard()
     try:
+        # Load predictions and ground truth
         predictions_df = pd.read_csv(prediction_file.name)
         ground_truth_df = pd.read_csv(ground_truth_file)
         model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
+        # Merge predictions with ground truth
         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
+        # Evaluate predictions
         valid_predictions = merged_df.dropna(subset=['pred_answer'])
         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
         total_predictions = len(merged_df)
         total_valid_predictions = len(valid_predictions)
+        # Calculate accuracy
         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
             'total_questions': total_predictions,
         }
+        # Update leaderboard
         update_leaderboard(results)
         return "Evaluation completed successfully! Leaderboard updated.", load_leaderboard()
     except Exception as e:
         return f"Error during evaluation: {str(e)}", load_leaderboard()
+# Gradio Interface
 with gr.Blocks() as demo:
     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")