Spaces:

MBZUAI-LLM
/

Mobile-MMLU-Challenge

Running

App Files Files Community

SondosMB committed on Dec 20, 2024

Commit

13e4c4d

verified ·

1 Parent(s): 0ddd3ea

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -37

app.py CHANGED Viewed

@@ -170,47 +170,39 @@ import re
 from datetime import datetime
 from huggingface_hub import hf_hub_download
-LEADERBOARD_FILE = "leaderboard.csv"  # File to store all submissions persistently
-GROUND_TRUTH_FILE = "ground_truth.csv"  # File for ground truth data
 LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
-# Disable symlink warnings
 os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
 def initialize_leaderboard_file():
     """
     Ensure the leaderboard file exists and has the correct headers.
     """
     if not os.path.exists(LEADERBOARD_FILE):
-        # Create the file with headers
         pd.DataFrame(columns=[
             "Model Name", "Overall Accuracy", "Valid Accuracy",
             "Correct Predictions", "Total Questions", "Timestamp"
         ]).to_csv(LEADERBOARD_FILE, index=False)
-    else:
-        # Check if the file is empty and write headers if needed
-        if os.stat(LEADERBOARD_FILE).st_size == 0:
-            pd.DataFrame(columns=[
-                "Model Name", "Overall Accuracy", "Valid Accuracy",
-                "Correct Predictions", "Total Questions", "Timestamp"
-            ]).to_csv(LEADERBOARD_FILE, index=False)
 def clean_answer(answer):
-    """
-    Clean and normalize the predicted answers.
-    """
     if pd.isna(answer):
         return None
     answer = str(answer)
     clean = re.sub(r'[^A-Da-d]', '', answer)
-    if clean:
-        return clean[0].upper()
-    return None
 def update_leaderboard(results):
-    """
-    Append new submission results to the leaderboard file.
-    """
     new_entry = {
         "Model Name": results['model_name'],
         "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
@@ -219,14 +211,10 @@ def update_leaderboard(results):
         "Total Questions": results['total_questions'],
         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
     }
     new_entry_df = pd.DataFrame([new_entry])
     new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
 def load_leaderboard():
-    """
-    Load all submissions from the leaderboard file.
-    """
     if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
         return pd.DataFrame({
             "Model Name": [],
@@ -239,17 +227,16 @@ def load_leaderboard():
     return pd.read_csv(LEADERBOARD_FILE)
 def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
-    """
-    Evaluate predictions and optionally add results to the leaderboard.
-    """
     try:
-        # Load ground truth data
         ground_truth_path = hf_hub_download(
             repo_id="SondosMB/ground-truth-dataset",
-            filename=GROUND_TRUTH_FILE,
             use_auth_token=True
         )
         ground_truth_df = pd.read_csv(ground_truth_path)
     except Exception as e:
         return f"Error loading ground truth: {e}", load_leaderboard()
@@ -257,18 +244,15 @@ def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
         return "Prediction file not uploaded.", load_leaderboard()
     try:
-        # Load predictions and merge with ground truth
         predictions_df = pd.read_csv(prediction_file.name)
         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
-        # Evaluate predictions
         valid_predictions = merged_df.dropna(subset=['pred_answer'])
         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
         total_predictions = len(merged_df)
         total_valid_predictions = len(valid_predictions)
-        # Calculate accuracy
         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
@@ -280,7 +264,6 @@ def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
             'total_questions': total_predictions,
         }
-        # Update leaderboard only if opted in
         if add_to_leaderboard:
             update_leaderboard(results)
             return "Evaluation completed and added to leaderboard.", load_leaderboard()
@@ -289,15 +272,12 @@ def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
     except Exception as e:
         return f"Error during evaluation: {str(e)}", load_leaderboard()
-# Initialize leaderboard file
 initialize_leaderboard_file()
-# Gradio Interface
 with gr.Blocks() as demo:
     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
     with gr.Tabs():
-        # Submission Tab
         with gr.TabItem("🏅 Submission"):
             file_input = gr.File(label="Upload Prediction CSV")
             model_name_input = gr.Textbox(label="Model Name", placeholder="Enter your model name")
@@ -316,7 +296,6 @@ with gr.Blocks() as demo:
                 outputs=[eval_status, leaderboard_table_preview],
             )
-        # Leaderboard Tab
         with gr.TabItem("🏅 Leaderboard"):
             leaderboard_table = gr.Dataframe(
                 value=load_leaderboard(),

 from datetime import datetime
 from huggingface_hub import hf_hub_download
+LEADERBOARD_FILE = "leaderboard.csv"
+GROUND_TRUTH_FILE = "ground_truth.csv"
 LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
+# Ensure authentication and suppress warnings
 os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
+HF_TOKEN = os.getenv("HF_TOKEN")
+if not HF_TOKEN:
+    raise ValueError("HF_TOKEN environment variable is not set or invalid.")
 def initialize_leaderboard_file():
     """
     Make sure the leaderboard CSV exists and starts with the header row.

     The header-only file is (re)written when LEADERBOARD_FILE is missing
     or is present but zero bytes long; a non-empty file is left untouched.
     """
     header_columns = [
         "Model Name", "Overall Accuracy", "Valid Accuracy",
         "Correct Predictions", "Total Questions", "Timestamp"
     ]
     file_missing = not os.path.exists(LEADERBOARD_FILE)
     # Short-circuit: the size is only checked once the file is known to exist.
     if file_missing or os.stat(LEADERBOARD_FILE).st_size == 0:
         pd.DataFrame(columns=header_columns).to_csv(LEADERBOARD_FILE, index=False)
 def clean_answer(answer):
     """
     Normalize a raw predicted answer to a single option letter.

     Parameters:
         answer: The raw cell value from the predictions file (any scalar).

     Returns:
         "A", "B", "C" or "D", or None when the value is missing or
         contains no A-D letter at all.
     """
     if pd.isna(answer):
         return None
     answer = str(answer)
     # Prefer a letter that stands on its own ("B", "(b)", "Answer: C").
     # Taking the first A-D character of the whole string would pick up
     # letters from surrounding words, e.g. the "A" in "Answer: B".
     standalone = re.search(r'\b[A-Da-d]\b', answer)
     if standalone:
         return standalone.group(0).upper()
     # Fallback: first A-D character anywhere, so inputs without a
     # stand-alone letter (e.g. "ABCD", "B1") resolve as before.
     clean = re.sub(r'[^A-Da-d]', '', answer)
     return clean[0].upper() if clean else None
 def update_leaderboard(results):
     new_entry = {
         "Model Name": results['model_name'],
         "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
         "Total Questions": results['total_questions'],
         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
     }
     new_entry_df = pd.DataFrame([new_entry])
     new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)
 def load_leaderboard():
     if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
         return pd.DataFrame({
             "Model Name": [],
     return pd.read_csv(LEADERBOARD_FILE)
 def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
     try:
         ground_truth_path = hf_hub_download(
             repo_id="SondosMB/ground-truth-dataset",
+            filename="ground_truth.csv",
+            repo_type="dataset",
             use_auth_token=True
         )
         ground_truth_df = pd.read_csv(ground_truth_path)
+    except FileNotFoundError:
+        return "Ground truth file not found in the dataset repository.", load_leaderboard()
     except Exception as e:
         return f"Error loading ground truth: {e}", load_leaderboard()
         return "Prediction file not uploaded.", load_leaderboard()
     try:
         predictions_df = pd.read_csv(prediction_file.name)
         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
         valid_predictions = merged_df.dropna(subset=['pred_answer'])
         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
         total_predictions = len(merged_df)
         total_valid_predictions = len(valid_predictions)
         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
             'total_questions': total_predictions,
         }
         if add_to_leaderboard:
             update_leaderboard(results)
             return "Evaluation completed and added to leaderboard.", load_leaderboard()
     except Exception as e:
         return f"Error during evaluation: {str(e)}", load_leaderboard()
 initialize_leaderboard_file()
 with gr.Blocks() as demo:
     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
     with gr.Tabs():
         with gr.TabItem("🏅 Submission"):
             file_input = gr.File(label="Upload Prediction CSV")
             model_name_input = gr.Textbox(label="Model Name", placeholder="Enter your model name")
                 outputs=[eval_status, leaderboard_table_preview],
             )
         with gr.TabItem("🏅 Leaderboard"):
             leaderboard_table = gr.Dataframe(
                 value=load_leaderboard(),