Spaces:

atlasia
/

Open-Arabic-Dialect-Identification-Leaderboard

Sleeping

App Files Files Community

BounharAbdelaziz committed on Jan 7

Commit

17068d0

1 Parent(s): 4622b34

fix permissions

Browse files

Files changed (1) hide show

utils.py +75 -140

utils.py CHANGED Viewed

@@ -17,6 +17,17 @@ from huggingface_hub import HfApi
 from pathlib import Path
 from constants import *
 def predict_label(text, model, language_mapping_dict, use_mapping=False):
     """
     Runs predictions for a fasttext model.
@@ -183,76 +194,38 @@ def run_eval_one_vs_all(data_test, TARGET_LANG='Morocco'):
     return out
-def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE):
-    # Initialize Hugging Face API
-    api = HfApi()
-    # Get the repository ID from environment variables
-    repo_id = os.environ.get("SPACE_ID")
-    if not repo_id:
-        raise ValueError("This code must be run in a Hugging Face Space")
-    # Create a temporary directory for file operations
-    temp_dir = Path("/tmp")
-    temp_file = temp_dir / DIALECT_CONFUSION_LEADERBOARD_FILE
     try:
-        # Try to download existing file from the Space
-        try:
-            api.hf_hub_download(
-                repo_id=repo_id,
-                filename=DIALECT_CONFUSION_LEADERBOARD_FILE,
-                repo_type="space",
-                local_dir=temp_dir
-            )
-        except Exception:
-            # If file doesn't exist, start with empty data
-            data = []
-        else:
-            # If file exists, read it
-            with open(temp_file, "r") as f:
-                data = json.load(f)
-        # Process the results for each dialect/country
-        for _, row in result_df.iterrows():
-            dialect = row['dialect']
-            # Skip 'Other' class
-            if dialect == 'Other':
-                continue
-            # Find existing target_lang entry or create a new one
-            target_entry = next((item for item in data if target_lang in item), None)
-            if target_entry is None:
-                target_entry = {target_lang: {}}
-                data.append(target_entry)
-            # Get the country-specific data for this target language
-            country_data = target_entry[target_lang]
-            # Initialize the dialect/country entry if it doesn't exist
-            if dialect not in country_data:
-                country_data[dialect] = {}
-            # Update the model metrics under the model name for the given dialect
-            country_data[dialect][model_name] = float(row['false_positive_rate'])
-        # Save updated data to temporary file
-        with open(temp_file, "w") as f:
-            json.dump(data, f, indent=4)
-        # Upload the file back to the Space
-        api.upload_file(
-            path_or_fileobj=str(temp_file),
-            path_in_repo=DIALECT_CONFUSION_LEADERBOARD_FILE,
-            repo_id=repo_id,
-            repo_type="space"
-        )
-    finally:
-        # Clean up temporary file
-        if temp_file.exists():
-            temp_file.unlink()
 def handle_evaluation(model_path, model_path_bin, use_mapping=False):
@@ -371,89 +344,51 @@ def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/su
     return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message
 def update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
-    # Initialize Hugging Face API
-    api = HfApi()
-    # Get the repository ID from environment variables
-    # HF Spaces sets this automatically
-    repo_id = os.environ.get("SPACE_ID")
-    if not repo_id:
-        raise ValueError("This code must be run in a Hugging Face Space")
-    # Create a temporary directory for file operations
-    temp_dir = Path("/tmp")
-    temp_file = temp_dir / MULTI_DIALECTS_LEADERBOARD_FILE
     try:
-        # Try to download existing file from the Space
-        try:
-            api.hf_hub_download(
-                repo_id=repo_id,
-                filename=MULTI_DIALECTS_LEADERBOARD_FILE,
-                repo_type="space",
-                local_dir=temp_dir
-            )
-        except Exception:
-            # If file doesn't exist, start with empty data
-            data = []
-        else:
-            # If file exists, read it
-            with open(temp_file, "r") as f:
-                data = json.load(f)
-        # Process the results for each dialect/country
-        for _, row in result_df.iterrows():
-            country = row['country']
-            # skip 'Other' class
-            if country == 'Other':
-                continue
-            # Create metrics dictionary
-            metrics = {
-                'f1_score': float(row['f1_score']),
-                'precision': float(row['precision']),
-                'recall': float(row['recall']),
-                'macro_f1_score': float(row['macro_f1_score']),
-                'micro_f1_score': float(row['micro_f1_score']),
-                'weighted_f1_score': float(row['weighted_f1_score']),
-                'specificity': float(row['specificity']),
-                'false_positive_rate': float(row['false_positive_rate']),
-                'false_negative_rate': float(row['false_negative_rate']),
-                'negative_predictive_value': float(row['negative_predictive_value']),
-                'balanced_accuracy': float(row['balanced_accuracy']),
-                'matthews_correlation': float(row['matthews_correlation']),
-                'n_test_samples': int(row['samples'])
-            }
-            # Find existing country entry or create new one
-            country_entry = next((item for item in data if country in item), None)
-            if country_entry is None:
-                country_entry = {country: {}}
-                data.append(country_entry)
-            # Update the model metrics
-            if country not in country_entry:
-                country_entry[country] = {}
-            country_entry[country][model_name] = metrics
-        # Save updated data to temporary file
-        with open(temp_file, "w") as f:
-            json.dump(data, f, indent=4)
-        # Upload the file back to the Space
-        api.upload_file(
-            path_or_fileobj=str(temp_file),
-            path_in_repo=MULTI_DIALECTS_LEADERBOARD_FILE,
-            repo_id=repo_id,
-            repo_type="space"
-        )
-    finally:
-        # Clean up temporary file
-        if temp_file.exists():
-            temp_file.unlink()
 def load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE):
     current_dir = os.path.dirname(os.path.abspath(__file__))
     DIALECT_CONFUSION_LEADERBOARD_FILE = os.path.join(current_dir, DIALECT_CONFUSION_LEADERBOARD_FILE)

 from pathlib import Path
 from constants import *
def get_persistent_storage_dir(base_dir="/data"):
    """Return the directory used for persistent leaderboard storage.

    Args:
        base_dir: Directory to use as the storage root. Defaults to
            ``/data``, which is where HF Spaces store persistent data.

    Returns:
        A ``pathlib.Path`` for ``base_dir``. The directory is not created
        or checked here.
    """
    return Path(base_dir)
def ensure_storage_dir(storage_dir=None):
    """Create the storage directory if it is missing and return it.

    Args:
        storage_dir: Optional directory to create. When ``None`` (the
            default) the directory returned by
            ``get_persistent_storage_dir()`` is used.

    Returns:
        The storage directory as a ``pathlib.Path``. Missing parent
        directories are created as well; an already existing directory is
        not an error.
    """
    if storage_dir is None:
        storage_dir = get_persistent_storage_dir()
    storage_dir = Path(storage_dir)
    storage_dir.mkdir(parents=True, exist_ok=True)
    return storage_dir
 def predict_label(text, model, language_mapping_dict, use_mapping=False):
     """
     Runs predictions for a fasttext model.
     return out
def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    """Merge one model's one-vs-all results into the stored leaderboard.

    The leaderboard file is a JSON list of ``{target_lang: {dialect:
    {model_name: false_positive_rate}}}`` entries. Existing entries for
    other models and dialects are preserved.

    Args:
        result_df: DataFrame with a ``dialect`` and a
            ``false_positive_rate`` column; rows whose dialect is
            ``'Other'`` are ignored.
        model_name: Key under which this model's rates are stored.
        target_lang: Target language whose entry is updated (created on
            first use).
        DIALECT_CONFUSION_LEADERBOARD_FILE: Leaderboard file name. A
            relative name is resolved inside the directory returned by
            ``ensure_storage_dir()``; an absolute path is used as given.

    Raises:
        json.JSONDecodeError: If the existing file has content that is not
            valid JSON. It is deliberately not overwritten in that case.
    """
    leaderboard_path = Path(DIALECT_CONFUSION_LEADERBOARD_FILE)
    if leaderboard_path.is_absolute():
        # An absolute path bypasses the storage directory, so only its own
        # parent has to exist.
        leaderboard_path.parent.mkdir(parents=True, exist_ok=True)
    else:
        leaderboard_path = ensure_storage_dir() / leaderboard_path

    try:
        with open(leaderboard_path, "r", encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        raw = ""
    # A missing or empty file means "no leaderboard yet".
    data = json.loads(raw) if raw.strip() else []

    # The target entry is the same for every row, so it is resolved once,
    # and only when the first non-'Other' row shows up (an input with no
    # usable rows must not add an empty entry).
    country_data = None
    for _, row in result_df.iterrows():
        dialect = row['dialect']
        if dialect == 'Other':
            continue
        if country_data is None:
            target_entry = next((item for item in data if target_lang in item), None)
            if target_entry is None:
                target_entry = {target_lang: {}}
                data.append(target_entry)
            country_data = target_entry[target_lang]
        country_data.setdefault(dialect, {})[model_name] = float(row['false_positive_rate'])

    # Write to a sibling temp file and rename it over the target so that a
    # failure while dumping cannot leave a truncated leaderboard behind.
    tmp_path = leaderboard_path.with_name(leaderboard_path.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)
        tmp_path.replace(leaderboard_path)
    finally:
        if tmp_path.exists():
            tmp_path.unlink()
 def handle_evaluation(model_path, model_path_bin, use_mapping=False):
     return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message
 def update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
+    storage_dir = ensure_storage_dir()
+    leaderboard_path = storage_dir / MULTI_DIALECTS_LEADERBOARD_FILE
     try:
+        with open(leaderboard_path, "r") as f:
+            data = json.load(f)
+    except FileNotFoundError:
+        data = []
+    # Process the results for each dialect/country
+    for _, row in result_df.iterrows():
+        country = row['country']
+        if country == 'Other':
+            continue
+        metrics = {
+            'f1_score': float(row['f1_score']),
+            'precision': float(row['precision']),
+            'recall': float(row['recall']),
+            'macro_f1_score': float(row['macro_f1_score']),
+            'micro_f1_score': float(row['micro_f1_score']),
+            'weighted_f1_score': float(row['weighted_f1_score']),
+            'specificity': float(row['specificity']),
+            'false_positive_rate': float(row['false_positive_rate']),
+            'false_negative_rate': float(row['false_negative_rate']),
+            'negative_predictive_value': float(row['negative_predictive_value']),
+            'balanced_accuracy': float(row['balanced_accuracy']),
+            'matthews_correlation': float(row['matthews_correlation']),
+            'n_test_samples': int(row['samples'])
+        }
+        country_entry = next((item for item in data if country in item), None)
+        if country_entry is None:
+            country_entry = {country: {}}
+            data.append(country_entry)
+        if country not in country_entry:
+            country_entry[country] = {}
+        country_entry[country][model_name] = metrics
+    # Save updated leaderboard data
+    with open(leaderboard_path, "w") as f:
+        json.dump(data, f, indent=4)
 def load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE):
     current_dir = os.path.dirname(os.path.abspath(__file__))
     DIALECT_CONFUSION_LEADERBOARD_FILE = os.path.join(current_dir, DIALECT_CONFUSION_LEADERBOARD_FILE)