davidpomerenke committed
Commit 0cbac6c · verified · 1 Parent(s): b6a7bfd

Upload from GitHub Actions: fix norwegian

evals/datasets_/arc.py CHANGED
@@ -1,6 +1,5 @@
 import random
 
-from langcodes import standardize_tag
 from rich import print
 from models import translate_google, get_google_supported_languages
 from tqdm import tqdm
@@ -9,11 +8,11 @@ import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os
 
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, standardize_bcp47
 
 slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
 tags_uhura_arc_easy = {
-    standardize_tag(a.split("_")[0], macro=True): a
+    standardize_bcp47(a.split("_")[0]): a
     for a in _get_dataset_config_names(slug_uhura_arc_easy)
     if not a.endswith("unmatched")
 }
@@ -35,7 +34,7 @@ random.shuffle(common_ids_test)
 
 slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
 tags_uhura_arc_easy_translated = {
-    standardize_tag(a.split("_")[0], macro=True): a
+    standardize_bcp47(a.split("_")[0]): a
     for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 }
 
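The call-site change above swaps langcodes' standardize_tag(..., macro=True) for the new standardize_bcp47 helper (added in evals/datasets_/util.py below). A minimal REPL sketch of the difference, assuming langcodes' documented behavior of collapsing Norwegian Bokmål into the macrolanguage tag; the input tag is illustrative, not taken from the diff:

>>> from langcodes import standardize_tag
>>> standardize_tag("nb", macro=True)  # old call: Bokmål collapses to the macrolanguage
'no'
>>> from datasets_.util import standardize_bcp47
>>> standardize_bcp47("nb")  # new call: "no" is re-mapped to the preferred variant
'nb'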
evals/datasets_/mgsm.py CHANGED
@@ -3,8 +3,8 @@ import os
 import random
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset, cache
-from langcodes import Language, standardize_tag
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache, standardize_bcp47
+from langcodes import Language
 from models import get_google_supported_languages, translate_google
 from rich import print
 from tqdm import tqdm
@@ -12,20 +12,20 @@ from tqdm.asyncio import tqdm_asyncio
 
 slug_mgsm = "juletxara/mgsm"
 tags_mgsm = {
-    standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_mgsm)
+    standardize_bcp47(a): a for a in _get_dataset_config_names(slug_mgsm)
 }
 slug_afrimgsm = "masakhane/afrimgsm"
 tags_afrimgsm = {
-    standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_afrimgsm)
+    standardize_bcp47(a): a for a in _get_dataset_config_names(slug_afrimgsm)
 }
 slug_gsm8kx = "Eurolingua/gsm8kx"
 tags_gsm8kx = {
-    standardize_tag(a, macro=True): a
+    standardize_bcp47(a): a
     for a in _get_dataset_config_names(slug_gsm8kx, trust_remote_code=True)
 }
 slug_gsm_autotranslated = "fair-forward/gsm-autotranslated"
 tags_gsm_autotranslated = {
-    standardize_tag(a, macro=True): a
+    standardize_bcp47(a): a
     for a in _get_dataset_config_names(slug_gsm_autotranslated)
 }
 
evals/datasets_/mmlu.py CHANGED
@@ -4,7 +4,7 @@ import random
 from collections import Counter, defaultdict
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset, cache
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache, standardize_bcp47
 from langcodes import Language, standardize_tag
 from models import get_google_supported_languages, translate_google
 from rich import print
@@ -24,7 +24,7 @@ def print_datasets_analysis():
     ds1 = _load_dataset(slug1, "eng")
     print_counts(slug1, ds1["dev"]["subject"], ds1["test"]["subject"])
     langs1 = _get_dataset_config_names(slug1)
-    langs1 = [standardize_tag(a, macro=True) for a in langs1]
+    langs1 = [standardize_bcp47(a) for a in langs1]
 
     slug2 = "openai/MMMLU"  # does not have dev set! – but: these languages are all also present in Global-MMLU
     ds2 = _load_dataset(slug2, "FR_FR")
@@ -37,7 +37,7 @@ def print_datasets_analysis():
     ds3 = _load_dataset(slug3, "en")
     print_counts(slug3, ds3["dev"]["subject"], ds3["test"]["subject"])
     langs3 = _get_dataset_config_names(slug3)
-    langs3 = [standardize_tag(a, macro=True) for a in langs3]
+    langs3 = [standardize_bcp47(a) for a in langs3]
 
     slug4 = "lighteval/okapi_mmlu"
     ds4 = _load_dataset(slug4, "ar", trust_remote_code=True)
@@ -132,11 +132,11 @@ def add_choices(row):
 
 
 tags_afrimmlu = {
-    standardize_tag(a, macro=True): a
+    standardize_bcp47(a): a
     for a in _get_dataset_config_names("masakhane/afrimmlu")
 }
 tags_global_mmlu = {
-    standardize_tag(a, macro=True): a
+    standardize_bcp47(a): a
     for a in _get_dataset_config_names("CohereForAI/Global-MMLU")
 }
 tags_okapi = _get_dataset_config_names("lighteval/okapi_mmlu")
@@ -145,7 +145,7 @@ tags_mmlux = set(
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
 tags_mmlu_autotranslated = {
-    standardize_tag(a, macro=True): a
+    standardize_bcp47(a): a
     for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
 }
 
evals/datasets_/util.py CHANGED
@@ -6,10 +6,28 @@ from datasets import Dataset, get_dataset_config_names, load_dataset
 from datasets.exceptions import DatasetNotFoundError
 from huggingface_hub.errors import RepositoryNotFoundError
 from joblib.memory import Memory
+from langcodes import standardize_tag
 
 cache = Memory(location=".cache", verbose=0).cache
 TOKEN = os.getenv("HUGGINGFACE_ACCESS_TOKEN")
 
+# Macrolanguage mappings: when standardize_tag returns a macrolanguage,
+# map it to the preferred specific variant for consistency across datasets.
+# This ensures results from different benchmarks use the same language code.
+MACROLANGUAGE_MAPPINGS = {
+    "no": "nb",  # Norwegian -> Norwegian Bokmål (most widely used variant)
+    # Add more mappings here as needed, e.g.:
+    # "ms": "zsm",  # Malay -> Standard Malay
+    # "ar": "arb",  # Arabic -> Standard Arabic
+}
+
+
+def standardize_bcp47(tag: str, macro: bool = True) -> str:
+    """Standardize a BCP-47 tag with consistent macrolanguage handling."""
+
+    standardized = standardize_tag(tag, macro=macro)
+    return MACROLANGUAGE_MAPPINGS.get(standardized, standardized)
+
 
 @cache
 def _get_dataset_config_names(dataset, **kwargs):
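
A short usage sketch of the new helper: the "no" -> "nb" result comes from MACROLANGUAGE_MAPPINGS above, and unmapped tags fall through the dict.get fallback unchanged.

>>> from datasets_.util import standardize_bcp47
>>> standardize_bcp47("no")  # mapped: macrolanguage Norwegian -> Bokmål
'nb'
>>> standardize_bcp47("sw")  # unmapped: passes through standardize_tag as-is
'sw'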