BounharAbdelaziz committed on
Commit
17068d0
·
1 Parent(s): 4622b34

fix permissions

Browse files
Files changed (1) hide show
  1. utils.py +75 -140
utils.py CHANGED
@@ -17,6 +17,17 @@ from huggingface_hub import HfApi
17
  from pathlib import Path
18
  from constants import *
19
 
 
 
 
 
 
 
 
 
 
 
 
20
  def predict_label(text, model, language_mapping_dict, use_mapping=False):
21
  """
22
  Runs predictions for a fasttext model.
@@ -183,76 +194,38 @@ def run_eval_one_vs_all(data_test, TARGET_LANG='Morocco'):
183
 
184
  return out
185
 
186
- def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE):
187
-
188
- # Initialize Hugging Face API
189
- api = HfApi()
190
-
191
- # Get the repository ID from environment variables
192
- repo_id = os.environ.get("SPACE_ID")
193
- if not repo_id:
194
- raise ValueError("This code must be run in a Hugging Face Space")
195
-
196
- # Create a temporary directory for file operations
197
- temp_dir = Path("/tmp")
198
- temp_file = temp_dir / DIALECT_CONFUSION_LEADERBOARD_FILE
199
 
200
  try:
201
- # Try to download existing file from the Space
202
- try:
203
- api.hf_hub_download(
204
- repo_id=repo_id,
205
- filename=DIALECT_CONFUSION_LEADERBOARD_FILE,
206
- repo_type="space",
207
- local_dir=temp_dir
208
- )
209
- except Exception:
210
- # If file doesn't exist, start with empty data
211
- data = []
212
- else:
213
- # If file exists, read it
214
- with open(temp_file, "r") as f:
215
- data = json.load(f)
216
-
217
- # Process the results for each dialect/country
218
- for _, row in result_df.iterrows():
219
- dialect = row['dialect']
220
- # Skip 'Other' class
221
- if dialect == 'Other':
222
- continue
223
-
224
- # Find existing target_lang entry or create a new one
225
- target_entry = next((item for item in data if target_lang in item), None)
226
- if target_entry is None:
227
- target_entry = {target_lang: {}}
228
- data.append(target_entry)
229
-
230
- # Get the country-specific data for this target language
231
- country_data = target_entry[target_lang]
232
-
233
- # Initialize the dialect/country entry if it doesn't exist
234
- if dialect not in country_data:
235
- country_data[dialect] = {}
236
-
237
- # Update the model metrics under the model name for the given dialect
238
- country_data[dialect][model_name] = float(row['false_positive_rate'])
239
 
240
- # Save updated data to temporary file
241
- with open(temp_file, "w") as f:
242
- json.dump(data, f, indent=4)
 
 
 
243
 
244
- # Upload the file back to the Space
245
- api.upload_file(
246
- path_or_fileobj=str(temp_file),
247
- path_in_repo=DIALECT_CONFUSION_LEADERBOARD_FILE,
248
- repo_id=repo_id,
249
- repo_type="space"
250
- )
251
 
252
- finally:
253
- # Clean up temporary file
254
- if temp_file.exists():
255
- temp_file.unlink()
256
 
257
  def handle_evaluation(model_path, model_path_bin, use_mapping=False):
258
 
@@ -371,89 +344,51 @@ def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/su
371
  return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message
372
 
373
  def update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
374
- # Initialize Hugging Face API
375
- api = HfApi()
376
-
377
- # Get the repository ID from environment variables
378
- # HF Spaces sets this automatically
379
- repo_id = os.environ.get("SPACE_ID")
380
- if not repo_id:
381
- raise ValueError("This code must be run in a Hugging Face Space")
382
-
383
- # Create a temporary directory for file operations
384
- temp_dir = Path("/tmp")
385
- temp_file = temp_dir / MULTI_DIALECTS_LEADERBOARD_FILE
386
 
387
  try:
388
- # Try to download existing file from the Space
389
- try:
390
- api.hf_hub_download(
391
- repo_id=repo_id,
392
- filename=MULTI_DIALECTS_LEADERBOARD_FILE,
393
- repo_type="space",
394
- local_dir=temp_dir
395
- )
396
- except Exception:
397
- # If file doesn't exist, start with empty data
398
- data = []
399
- else:
400
- # If file exists, read it
401
- with open(temp_file, "r") as f:
402
- data = json.load(f)
403
 
404
- # Process the results for each dialect/country
405
- for _, row in result_df.iterrows():
406
- country = row['country']
407
- # skip 'Other' class
408
- if country == 'Other':
409
- continue
410
-
411
- # Create metrics dictionary
412
- metrics = {
413
- 'f1_score': float(row['f1_score']),
414
- 'precision': float(row['precision']),
415
- 'recall': float(row['recall']),
416
- 'macro_f1_score': float(row['macro_f1_score']),
417
- 'micro_f1_score': float(row['micro_f1_score']),
418
- 'weighted_f1_score': float(row['weighted_f1_score']),
419
- 'specificity': float(row['specificity']),
420
- 'false_positive_rate': float(row['false_positive_rate']),
421
- 'false_negative_rate': float(row['false_negative_rate']),
422
- 'negative_predictive_value': float(row['negative_predictive_value']),
423
- 'balanced_accuracy': float(row['balanced_accuracy']),
424
- 'matthews_correlation': float(row['matthews_correlation']),
425
- 'n_test_samples': int(row['samples'])
426
- }
427
-
428
- # Find existing country entry or create new one
429
- country_entry = next((item for item in data if country in item), None)
430
- if country_entry is None:
431
- country_entry = {country: {}}
432
- data.append(country_entry)
433
 
434
- # Update the model metrics
435
- if country not in country_entry:
436
- country_entry[country] = {}
437
- country_entry[country][model_name] = metrics
 
 
 
 
 
 
 
 
 
 
 
438
 
439
- # Save updated data to temporary file
440
- with open(temp_file, "w") as f:
441
- json.dump(data, f, indent=4)
 
442
 
443
- # Upload the file back to the Space
444
- api.upload_file(
445
- path_or_fileobj=str(temp_file),
446
- path_in_repo=MULTI_DIALECTS_LEADERBOARD_FILE,
447
- repo_id=repo_id,
448
- repo_type="space"
449
- )
450
 
451
- finally:
452
- # Clean up temporary file
453
- if temp_file.exists():
454
- temp_file.unlink()
455
 
456
-
457
  def load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE):
458
  current_dir = os.path.dirname(os.path.abspath(__file__))
459
  DIALECT_CONFUSION_LEADERBOARD_FILE = os.path.join(current_dir, DIALECT_CONFUSION_LEADERBOARD_FILE)
 
17
  from pathlib import Path
18
  from constants import *
19
 
20
def get_persistent_storage_dir():
    """Return the persistent storage directory used on HF Spaces.

    Returns:
        Path: always ``/data``, the location this module treats as the
        persistent data directory of a Hugging Face Space.
    """
    persistent_root = Path("/data")
    return persistent_root
24
+
25
def ensure_storage_dir():
    """Create the persistent storage directory if needed and return it.

    The directory is the one reported by ``get_persistent_storage_dir()``.
    Missing parent directories are created as well, and an already existing
    directory is not treated as an error.

    Returns:
        The storage directory path, guaranteed to exist after the call.
    """
    target_dir = get_persistent_storage_dir()
    # parents=True / exist_ok=True make the call safe to repeat on every update.
    target_dir.mkdir(exist_ok=True, parents=True)
    return target_dir
30
+
31
  def predict_label(text, model, language_mapping_dict, use_mapping=False):
32
  """
33
  Runs predictions for a fasttext model.
 
194
 
195
  return out
196
 
197
def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    """Record a model's per-dialect false-positive rates in the one-vs-all leaderboard.

    The leaderboard is a JSON file inside the persistent storage directory.
    It holds a list of entries shaped like
    ``{target_lang: {dialect: {model_name: false_positive_rate}}}``.
    Existing scores for other models and dialects are preserved; scores of
    ``model_name`` for the dialects present in ``result_df`` are overwritten.

    Args:
        result_df: Table iterated with ``iterrows()``; every row must provide
            a ``'dialect'`` and a ``'false_positive_rate'`` value. Rows whose
            dialect is ``'Other'`` are ignored.
        model_name: Key under which the scores are stored.
        target_lang: Target language whose entry is updated (created when
            the file has no entry for it yet).
        DIALECT_CONFUSION_LEADERBOARD_FILE: File name of the leaderboard,
            relative to the persistent storage directory.

    Raises:
        json.JSONDecodeError: If an existing leaderboard file is not valid
            JSON. The file is left untouched in that case.
    """
    leaderboard_path = ensure_storage_dir() / DIALECT_CONFUSION_LEADERBOARD_FILE

    try:
        with open(leaderboard_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # First submission: no leaderboard has been saved yet.
        data = []

    # The target-language entry is the same for every row, so resolve it once.
    # It is resolved lazily so that an input without usable rows does not add
    # an empty entry to the leaderboard.
    country_data = None
    for _, row in result_df.iterrows():
        dialect = row['dialect']
        if dialect == 'Other':
            continue

        if country_data is None:
            target_entry = next((item for item in data if target_lang in item), None)
            if target_entry is None:
                target_entry = {target_lang: {}}
                data.append(target_entry)
            country_data = target_entry[target_lang]

        country_data.setdefault(dialect, {})[model_name] = float(row['false_positive_rate'])

    # Write to a sibling file and rename it over the leaderboard so that an
    # interrupted write can never leave a truncated, unparseable leaderboard.
    tmp_path = leaderboard_path.with_name(leaderboard_path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    tmp_path.replace(leaderboard_path)
228
+
229
 
230
  def handle_evaluation(model_path, model_path_bin, use_mapping=False):
231
 
 
344
  return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message
345
 
def update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
    """Record a model's per-country metrics in the multilingual leaderboard.

    The leaderboard is a JSON file inside the persistent storage directory.
    It holds a list of entries shaped like
    ``{country: {model_name: {metric_name: value}}}``. Existing results of
    other models are preserved; results of ``model_name`` for the countries
    present in ``result_df`` are overwritten.

    Args:
        result_df: Table iterated with ``iterrows()``; every row must provide
            ``'country'``, ``'samples'`` and the metric columns listed in
            ``float_metric_names`` below. Rows whose country is ``'Other'``
            are ignored.
        model_name: Key under which the metrics are stored.
        MULTI_DIALECTS_LEADERBOARD_FILE: File name of the leaderboard,
            relative to the persistent storage directory.

    Raises:
        json.JSONDecodeError: If an existing leaderboard file is not valid
            JSON. The file is left untouched in that case.
    """
    # Metric columns copied verbatim (as floats) into the leaderboard; the
    # order here is the key order written to the JSON file.
    float_metric_names = (
        'f1_score',
        'precision',
        'recall',
        'macro_f1_score',
        'micro_f1_score',
        'weighted_f1_score',
        'specificity',
        'false_positive_rate',
        'false_negative_rate',
        'negative_predictive_value',
        'balanced_accuracy',
        'matthews_correlation',
    )

    leaderboard_path = ensure_storage_dir() / MULTI_DIALECTS_LEADERBOARD_FILE

    try:
        with open(leaderboard_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # First submission: no leaderboard has been saved yet.
        data = []

    # Process the results for each dialect/country
    for _, row in result_df.iterrows():
        country = row['country']
        if country == 'Other':
            continue

        metrics = {name: float(row[name]) for name in float_metric_names}
        # The sample count is stored under a different key than its column.
        metrics['n_test_samples'] = int(row['samples'])

        # Reuse the entry that already holds this country, or start a new one.
        country_entry = next((item for item in data if country in item), None)
        if country_entry is None:
            country_entry = {country: {}}
            data.append(country_entry)

        country_entry[country][model_name] = metrics

    # Write to a sibling file and rename it over the leaderboard so that an
    # interrupted write can never leave a truncated, unparseable leaderboard.
    tmp_path = leaderboard_path.with_name(leaderboard_path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    tmp_path.replace(leaderboard_path)
390
 
 
 
 
 
391
 
 
392
  def load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE):
393
  current_dir = os.path.dirname(os.path.abspath(__file__))
394
  DIALECT_CONFUSION_LEADERBOARD_FILE = os.path.join(current_dir, DIALECT_CONFUSION_LEADERBOARD_FILE)