Commit
·
17068d0
1
Parent(s):
4622b34
fix permissions
Browse files
utils.py
CHANGED
@@ -17,6 +17,17 @@ from huggingface_hub import HfApi
|
|
17 |
from pathlib import Path
|
18 |
from constants import *
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
def predict_label(text, model, language_mapping_dict, use_mapping=False):
|
21 |
"""
|
22 |
Runs predictions for a fasttext model.
|
@@ -183,76 +194,38 @@ def run_eval_one_vs_all(data_test, TARGET_LANG='Morocco'):
|
|
183 |
|
184 |
return out
|
185 |
|
186 |
-
def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE):
|
187 |
-
|
188 |
-
|
189 |
-
api = HfApi()
|
190 |
-
|
191 |
-
# Get the repository ID from environment variables
|
192 |
-
repo_id = os.environ.get("SPACE_ID")
|
193 |
-
if not repo_id:
|
194 |
-
raise ValueError("This code must be run in a Hugging Face Space")
|
195 |
-
|
196 |
-
# Create a temporary directory for file operations
|
197 |
-
temp_dir = Path("/tmp")
|
198 |
-
temp_file = temp_dir / DIALECT_CONFUSION_LEADERBOARD_FILE
|
199 |
|
200 |
try:
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
data = []
|
212 |
-
else:
|
213 |
-
# If file exists, read it
|
214 |
-
with open(temp_file, "r") as f:
|
215 |
-
data = json.load(f)
|
216 |
-
|
217 |
-
# Process the results for each dialect/country
|
218 |
-
for _, row in result_df.iterrows():
|
219 |
-
dialect = row['dialect']
|
220 |
-
# Skip 'Other' class
|
221 |
-
if dialect == 'Other':
|
222 |
-
continue
|
223 |
-
|
224 |
-
# Find existing target_lang entry or create a new one
|
225 |
-
target_entry = next((item for item in data if target_lang in item), None)
|
226 |
-
if target_entry is None:
|
227 |
-
target_entry = {target_lang: {}}
|
228 |
-
data.append(target_entry)
|
229 |
-
|
230 |
-
# Get the country-specific data for this target language
|
231 |
-
country_data = target_entry[target_lang]
|
232 |
-
|
233 |
-
# Initialize the dialect/country entry if it doesn't exist
|
234 |
-
if dialect not in country_data:
|
235 |
-
country_data[dialect] = {}
|
236 |
-
|
237 |
-
# Update the model metrics under the model name for the given dialect
|
238 |
-
country_data[dialect][model_name] = float(row['false_positive_rate'])
|
239 |
|
240 |
-
|
241 |
-
|
242 |
-
|
|
|
|
|
|
|
243 |
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
repo_id=repo_id,
|
249 |
-
repo_type="space"
|
250 |
-
)
|
251 |
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
|
257 |
def handle_evaluation(model_path, model_path_bin, use_mapping=False):
|
258 |
|
@@ -371,89 +344,51 @@ def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/su
|
|
371 |
return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message
|
372 |
|
373 |
def update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
# Get the repository ID from environment variables
|
378 |
-
# HF Spaces sets this automatically
|
379 |
-
repo_id = os.environ.get("SPACE_ID")
|
380 |
-
if not repo_id:
|
381 |
-
raise ValueError("This code must be run in a Hugging Face Space")
|
382 |
-
|
383 |
-
# Create a temporary directory for file operations
|
384 |
-
temp_dir = Path("/tmp")
|
385 |
-
temp_file = temp_dir / MULTI_DIALECTS_LEADERBOARD_FILE
|
386 |
|
387 |
try:
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
filename=MULTI_DIALECTS_LEADERBOARD_FILE,
|
393 |
-
repo_type="space",
|
394 |
-
local_dir=temp_dir
|
395 |
-
)
|
396 |
-
except Exception:
|
397 |
-
# If file doesn't exist, start with empty data
|
398 |
-
data = []
|
399 |
-
else:
|
400 |
-
# If file exists, read it
|
401 |
-
with open(temp_file, "r") as f:
|
402 |
-
data = json.load(f)
|
403 |
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
continue
|
410 |
-
|
411 |
-
# Create metrics dictionary
|
412 |
-
metrics = {
|
413 |
-
'f1_score': float(row['f1_score']),
|
414 |
-
'precision': float(row['precision']),
|
415 |
-
'recall': float(row['recall']),
|
416 |
-
'macro_f1_score': float(row['macro_f1_score']),
|
417 |
-
'micro_f1_score': float(row['micro_f1_score']),
|
418 |
-
'weighted_f1_score': float(row['weighted_f1_score']),
|
419 |
-
'specificity': float(row['specificity']),
|
420 |
-
'false_positive_rate': float(row['false_positive_rate']),
|
421 |
-
'false_negative_rate': float(row['false_negative_rate']),
|
422 |
-
'negative_predictive_value': float(row['negative_predictive_value']),
|
423 |
-
'balanced_accuracy': float(row['balanced_accuracy']),
|
424 |
-
'matthews_correlation': float(row['matthews_correlation']),
|
425 |
-
'n_test_samples': int(row['samples'])
|
426 |
-
}
|
427 |
-
|
428 |
-
# Find existing country entry or create new one
|
429 |
-
country_entry = next((item for item in data if country in item), None)
|
430 |
-
if country_entry is None:
|
431 |
-
country_entry = {country: {}}
|
432 |
-
data.append(country_entry)
|
433 |
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
438 |
|
439 |
-
|
440 |
-
|
441 |
-
|
|
|
442 |
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
)
|
450 |
|
451 |
-
finally:
|
452 |
-
# Clean up temporary file
|
453 |
-
if temp_file.exists():
|
454 |
-
temp_file.unlink()
|
455 |
|
456 |
-
|
457 |
def load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE):
|
458 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
459 |
DIALECT_CONFUSION_LEADERBOARD_FILE = os.path.join(current_dir, DIALECT_CONFUSION_LEADERBOARD_FILE)
|
|
|
17 |
from pathlib import Path
|
18 |
from constants import *
|
19 |
|
20 |
+
def get_persistent_storage_dir():
|
21 |
+
"""Get the persistent storage directory for HF Spaces"""
|
22 |
+
# HF Spaces store persistent data in /data
|
23 |
+
return Path("/data")
|
24 |
+
|
25 |
+
def ensure_storage_dir():
|
26 |
+
"""Ensure the storage directory exists"""
|
27 |
+
storage_dir = get_persistent_storage_dir()
|
28 |
+
storage_dir.mkdir(parents=True, exist_ok=True)
|
29 |
+
return storage_dir
|
30 |
+
|
31 |
def predict_label(text, model, language_mapping_dict, use_mapping=False):
|
32 |
"""
|
33 |
Runs predictions for a fasttext model.
|
|
|
194 |
|
195 |
return out
|
196 |
|
197 |
+
def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
|
198 |
+
storage_dir = ensure_storage_dir()
|
199 |
+
leaderboard_path = storage_dir / DIALECT_CONFUSION_LEADERBOARD_FILE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
|
201 |
try:
|
202 |
+
with open(leaderboard_path, "r") as f:
|
203 |
+
data = json.load(f)
|
204 |
+
except FileNotFoundError:
|
205 |
+
data = []
|
206 |
+
|
207 |
+
# Process the results for each dialect/country
|
208 |
+
for _, row in result_df.iterrows():
|
209 |
+
dialect = row['dialect']
|
210 |
+
if dialect == 'Other':
|
211 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
|
213 |
+
target_entry = next((item for item in data if target_lang in item), None)
|
214 |
+
if target_entry is None:
|
215 |
+
target_entry = {target_lang: {}}
|
216 |
+
data.append(target_entry)
|
217 |
+
|
218 |
+
country_data = target_entry[target_lang]
|
219 |
|
220 |
+
if dialect not in country_data:
|
221 |
+
country_data[dialect] = {}
|
222 |
+
|
223 |
+
country_data[dialect][model_name] = float(row['false_positive_rate'])
|
|
|
|
|
|
|
224 |
|
225 |
+
# Save updated leaderboard data
|
226 |
+
with open(leaderboard_path, "w") as f:
|
227 |
+
json.dump(data, f, indent=4)
|
228 |
+
|
229 |
|
230 |
def handle_evaluation(model_path, model_path_bin, use_mapping=False):
|
231 |
|
|
|
344 |
return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message
|
345 |
|
346 |
def update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
|
347 |
+
storage_dir = ensure_storage_dir()
|
348 |
+
leaderboard_path = storage_dir / MULTI_DIALECTS_LEADERBOARD_FILE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
349 |
|
350 |
try:
|
351 |
+
with open(leaderboard_path, "r") as f:
|
352 |
+
data = json.load(f)
|
353 |
+
except FileNotFoundError:
|
354 |
+
data = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
355 |
|
356 |
+
# Process the results for each dialect/country
|
357 |
+
for _, row in result_df.iterrows():
|
358 |
+
country = row['country']
|
359 |
+
if country == 'Other':
|
360 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
361 |
|
362 |
+
metrics = {
|
363 |
+
'f1_score': float(row['f1_score']),
|
364 |
+
'precision': float(row['precision']),
|
365 |
+
'recall': float(row['recall']),
|
366 |
+
'macro_f1_score': float(row['macro_f1_score']),
|
367 |
+
'micro_f1_score': float(row['micro_f1_score']),
|
368 |
+
'weighted_f1_score': float(row['weighted_f1_score']),
|
369 |
+
'specificity': float(row['specificity']),
|
370 |
+
'false_positive_rate': float(row['false_positive_rate']),
|
371 |
+
'false_negative_rate': float(row['false_negative_rate']),
|
372 |
+
'negative_predictive_value': float(row['negative_predictive_value']),
|
373 |
+
'balanced_accuracy': float(row['balanced_accuracy']),
|
374 |
+
'matthews_correlation': float(row['matthews_correlation']),
|
375 |
+
'n_test_samples': int(row['samples'])
|
376 |
+
}
|
377 |
|
378 |
+
country_entry = next((item for item in data if country in item), None)
|
379 |
+
if country_entry is None:
|
380 |
+
country_entry = {country: {}}
|
381 |
+
data.append(country_entry)
|
382 |
|
383 |
+
if country not in country_entry:
|
384 |
+
country_entry[country] = {}
|
385 |
+
country_entry[country][model_name] = metrics
|
386 |
+
|
387 |
+
# Save updated leaderboard data
|
388 |
+
with open(leaderboard_path, "w") as f:
|
389 |
+
json.dump(data, f, indent=4)
|
390 |
|
|
|
|
|
|
|
|
|
391 |
|
|
|
392 |
def load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE):
|
393 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
394 |
DIALECT_CONFUSION_LEADERBOARD_FILE = os.path.join(current_dir, DIALECT_CONFUSION_LEADERBOARD_FILE)
|