Add explanation of the AVG calculation
src/about.py  (+31 -9)  CHANGED
```diff
@@ -45,11 +45,11 @@ class Tasks(Enum):
     SUM = Task("scores", "SUM", "AVG (SUM)", TaskType.SUM, True) # Summarization - 要約
     alt_e_to_j_bert_score_ja_f1 = Task("scores", "alt-e-to-j_bert_score_ja_f1", "ALT E to J BERT Score", TaskType.MT)
     alt_e_to_j_bleu_ja = Task("scores", "alt-e-to-j_bleu_ja", "ALT E to J BLEU", TaskType.MT)
-    alt_e_to_j_comet_wmt22 = Task("scores", "alt-e-to-j_comet_wmt22", "ALT E to J COMET WMT22", TaskType.MT)
+    alt_e_to_j_comet_wmt22 = Task("scores", "alt-e-to-j_comet_wmt22", "ALT E to J COMET WMT22 ⭐", TaskType.MT)
     alt_j_to_e_bert_score_en_f1 = Task("scores", "alt-j-to-e_bert_score_en_f1", "ALT J to E BERT Score", TaskType.MT)
     alt_j_to_e_bleu_en = Task("scores", "alt-j-to-e_bleu_en", "ALT J to E BLEU", TaskType.MT)
-    alt_j_to_e_comet_wmt22 = Task("scores", "alt-j-to-e_comet_wmt22", "ALT J to E COMET WMT22", TaskType.MT)
-    chabsa_set_f1 = Task("scores", "chabsa_set_f1", "ChABSA", TaskType.EL)
+    alt_j_to_e_comet_wmt22 = Task("scores", "alt-j-to-e_comet_wmt22", "ALT J to E COMET WMT22 ⭐", TaskType.MT)
+    chabsa_set_f1 = Task("scores", "chabsa_set_f1", "ChABSA ⭐", TaskType.EL)
     commonsensemoralja_exact_match = Task(
         "scores", "commonsensemoralja_exact_match", "CommonSenseMoralJA", TaskType.MC
     )
@@ -61,7 +61,7 @@ class Tasks(Enum):
     jnli_exact_match = Task("scores", "jnli_exact_match", "JNLI", TaskType.NLI)
     jsem_exact_match = Task("scores", "jsem_exact_match", "JSEM", TaskType.NLI)
     jsick_exact_match = Task("scores", "jsick_exact_match", "JSICK", TaskType.NLI)
-    jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad", TaskType.RC)
+    jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad ⭐", TaskType.RC)
     jsts_pearson = Task(
         "scores", "jsts_pearson", "JSTS (Pearson)", TaskType.STS
     ) # Semantic Textual Similarity - 意味的類似度
@@ -69,8 +69,8 @@ class Tasks(Enum):
         "scores", "jsts_spearman", "JSTS (Spearman)", TaskType.STS
     ) # Semantic Textual Similarity - 意味的類似度
     kuci_exact_match = Task("scores", "kuci_exact_match", "KUCI", TaskType.MC)
-    mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS", TaskType.MR)
-    mbpp_code_exec = Task("scores", "mbpp_code_exec", "MBPP (exec)", TaskType.CG)
+    mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS ⭐", TaskType.MR)
+    mbpp_code_exec = Task("scores", "mbpp_code_exec", "MBPP (exec) ⭐", TaskType.CG)
     mbpp_pylint_check = Task("scores", "mbpp_pylint_check", "MBPP (pylint)", TaskType.CG)
     mmlu_en_exact_match = Task("scores", "mmlu_en_exact_match", "MMLU", TaskType.HE)
     niilc_char_f1 = Task("scores", "niilc_char_f1", "NIILC", TaskType.QA)
@@ -85,19 +85,19 @@ class Tasks(Enum):
     )
     wikicorpus_e_to_j_bleu_ja = Task("scores", "wikicorpus-e-to-j_bleu_ja", "WikiCorpus E to J BLEU", TaskType.MT)
     wikicorpus_e_to_j_comet_wmt22 = Task(
-        "scores", "wikicorpus-e-to-j_comet_wmt22", "WikiCorpus E to J COMET WMT22", TaskType.MT
+        "scores", "wikicorpus-e-to-j_comet_wmt22", "WikiCorpus E to J COMET WMT22 ⭐", TaskType.MT
     )
     wikicorpus_j_to_e_bert_score_en_f1 = Task(
         "scores", "wikicorpus-j-to-e_bert_score_en_f1", "WikiCorpus J to E BERT Score", TaskType.MT
     )
     wikicorpus_j_to_e_bleu_en = Task("scores", "wikicorpus-j-to-e_bleu_en", "WikiCorpus J to E BLEU", TaskType.MT)
     wikicorpus_j_to_e_comet_wmt22 = Task(
-        "scores", "wikicorpus-j-to-e_comet_wmt22", "WikiCorpus J to E COMET WMT22", TaskType.MT
+        "scores", "wikicorpus-j-to-e_comet_wmt22", "WikiCorpus J to E COMET WMT22 ⭐", TaskType.MT
     )
     xlsum_ja_bert_score_ja_f1 = Task("scores", "xlsum_ja_bert_score_ja_f1", "XL-Sum JA BERT Score", TaskType.SUM)
     xlsum_ja_bleu_ja = Task("scores", "xlsum_ja_bleu_ja", "XL-Sum JA BLEU", TaskType.SUM)
     xlsum_ja_rouge1 = Task("scores", "xlsum_ja_rouge1", "XL-Sum ROUGE1", TaskType.SUM)
-    xlsum_ja_rouge2 = Task("scores", "xlsum_ja_rouge2", "XL-Sum ROUGE2", TaskType.SUM)
+    xlsum_ja_rouge2 = Task("scores", "xlsum_ja_rouge2", "XL-Sum ROUGE2 ⭐", TaskType.SUM)
     # xlsum_ja_rouge2_scaling = Task("scores", "xlsum_ja_rouge2_scaling", "XL-Sum JA ROUGE2 Scaling")
     xlsum_ja_rougeLsum = Task("scores", "xlsum_ja_rougeLsum", "XL-Sum ROUGE-Lsum", TaskType.SUM)
 
```
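Between the enum hunks above and the documentation hunks below, a note on the shape of these entries: each member wraps a small `Task` record whose positional arguments are the results-file section (`"scores"`), the metric key, the leaderboard column label (the commit appends ⭐ to the labels of the metrics that feed the averages), the task category, and, on the AVG rows only, an extra boolean. The sketch below is a minimal reconstruction under assumed field names (`benchmark`, `metric`, `col_name`, `task_type`, `is_aggregate`); the real definitions live elsewhere in `src/about.py` and are not part of this diff.

```python
from dataclasses import dataclass
from enum import Enum


class TaskType(Enum):
    """Task categories appearing in the diff; the string values are illustrative."""
    NLI = "NLI"
    QA = "QA"
    RC = "RC"
    MC = "MC"
    EL = "EL"
    MR = "MR"
    MT = "MT"
    HE = "HE"
    CG = "CG"
    SUM = "SUM"
    STS = "STS"


@dataclass(frozen=True)
class Task:
    benchmark: str              # section of the results file, always "scores" here
    metric: str                 # metric key within that section
    col_name: str               # leaderboard column label; ⭐ marks averaged metrics
    task_type: TaskType         # category used for grouping (RC, EL, MR, ...)
    is_aggregate: bool = False  # True only on the per-category AVG rows


# Two entries mirroring lines of the diff:
sum_avg = Task("scores", "SUM", "AVG (SUM)", TaskType.SUM, True)
chabsa_set_f1 = Task("scores", "chabsa_set_f1", "ChABSA ⭐", TaskType.EL)
```

Putting the ⭐ directly in `col_name` makes the selection visible in the leaderboard UI itself, which is what the documentation added in the remaining two hunks (English, then Japanese) points readers to: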
```diff
@@ -220,6 +220,17 @@ This task is supported by llm-jp-eval, but it is not included in the evaluation
 ## Reproducibility
 To reproduce our results, please follow the instructions of the evalution tool, **llm-jp-eval** available in [Japanese](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) and in [English](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md).
 
+## Average Score Calculation
+For the following task categories (RC, EL, MR, MT, CG, SUM), the tasks marked with ⭐ are included in the average calculation:
+
+Tasks included in average calculation:
+- RC: JSQuAD ⭐
+- EL: ChABSA ⭐
+- MR: MAWPS ⭐
+- MT: ALT E to J COMET WMT22 ⭐, ALT J to E COMET WMT22 ⭐, WikiCorpus E to J COMET WMT22 ⭐, WikiCorpus J to E COMET WMT22 ⭐
+- CG: MBPP (exec) ⭐
+- SUM: XL-Sum ROUGE2 ⭐
+
 """
 
 LLM_BENCHMARKS_TEXT_JA = """
@@ -302,6 +313,17 @@ LLM_BENCHMARKS_TEXT_JA = """
 
 ## 再現性
 我々の結果を再現するには、評価ツール **llm-jp-eval** の指示に従ってください。詳細は [日本語](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) と [英語](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md) でご覧いただけます。
+
+## 平均スコアの計算について
+以下のタスクカテゴリー(RC、EL、MR、MT、CG、SUM)において、⭐マークの付いたタスクのみが平均値の計算に含まれます:
+
+平均値計算に含まれるタスク:
+- RC:JSQuAD ⭐
+- EL:ChABSA ⭐
+- MR:MAWPS ⭐
+- MT:ALT E to J COMET WMT22 ⭐、ALT J to E COMET WMT22 ⭐、WikiCorpus E to J COMET WMT22 ⭐、WikiCorpus J to E COMET WMT22 ⭐
+- CG:MBPP (exec) ⭐
+- SUM:XL-Sum ROUGE2 ⭐
 """
 
 
```
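The new "Average Score Calculation" section states which metric feeds each category average but not how the average is computed. The following is a minimal, self-contained sketch of the selection rule it describes, assuming the averaged tasks can be recognized by the ⭐ in their column label; the `Task` field names are again assumptions, and this is an illustration, not the leaderboard's actual aggregation code.

```python
from collections import namedtuple
from statistics import mean

# Illustrative stand-in for the Tasks enum entries; field names are assumptions.
Task = namedtuple("Task", ["benchmark", "metric", "col_name", "task_type"])

TASKS = [
    Task("scores", "jsquad_char_f1", "JSquad ⭐", "RC"),
    Task("scores", "alt-e-to-j_comet_wmt22", "ALT E to J COMET WMT22 ⭐", "MT"),
    Task("scores", "alt-j-to-e_comet_wmt22", "ALT J to E COMET WMT22 ⭐", "MT"),
    Task("scores", "alt-e-to-j_bleu_ja", "ALT E to J BLEU", "MT"),  # no ⭐: excluded
]


def category_averages(tasks, scores):
    """Per-category mean over only the ⭐-marked tasks."""
    buckets = {}
    for task in tasks:
        if "⭐" in task.col_name:
            buckets.setdefault(task.task_type, []).append(scores[task.metric])
    return {category: mean(values) for category, values in buckets.items()}


scores = {
    "jsquad_char_f1": 0.91,
    "alt-e-to-j_comet_wmt22": 0.85,
    "alt-j-to-e_comet_wmt22": 0.83,
    "alt-e-to-j_bleu_ja": 12.3,
}
print(category_averages(TASKS, scores))
# -> {'RC': 0.91, 'MT': 0.84}; BLEU is skipped because its label carries no ⭐.
```

In the real leaderboard the MT average draws on all four COMET WMT22 columns; the shortened task list here only demonstrates that unmarked columns such as BLEU stay out of the mean.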