Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Merge branch 'main' into select-column-accordion
Browse files
- README.md +9 -3
- app.py +15 -15
- src/about.py +88 -5
- src/display/utils.py +1 -1
- style.css +2 -0
README.md
CHANGED
|
@@ -1,14 +1,20 @@
|
|
| 1 |
---
|
| 2 |
title: Open Japanese LLM Leaderboard
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
app_file: app.py
|
| 8 |
pinned: true
|
| 9 |
license: apache-2.0
|
| 10 |
sdk_version: 5.1.0
|
| 11 |
fullWidth: true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
---
|
| 13 |
|
| 14 |
# Start the configuration
|
|
|
|
| 1 |
---
|
| 2 |
title: Open Japanese LLM Leaderboard
|
| 3 |
+
emoji: 🌸
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
app_file: app.py
|
| 8 |
pinned: true
|
| 9 |
license: apache-2.0
|
| 10 |
sdk_version: 5.1.0
|
| 11 |
fullWidth: true
|
| 12 |
+
tags:
|
| 13 |
+
- 日本語
|
| 14 |
+
- Japanese
|
| 15 |
+
- leaderboard
|
| 16 |
+
- language:日本語
|
| 17 |
+
- language:Japanese
|
| 18 |
---
|
| 19 |
|
| 20 |
# Start the configuration
|
app.py
CHANGED
|
@@ -79,7 +79,7 @@ def filter_models(
|
|
| 79 |
add_special_tokens_query: list,
|
| 80 |
num_few_shots_query: list,
|
| 81 |
version_query: list,
|
| 82 |
-
backend_query: list,
|
| 83 |
) -> pd.DataFrame:
|
| 84 |
print(f"Initial df shape: {df.shape}")
|
| 85 |
print(f"Initial df content:\n{df}")
|
|
@@ -118,8 +118,8 @@ def filter_models(
|
|
| 118 |
print(f"After version filter: {filtered_df.shape}")
|
| 119 |
|
| 120 |
# Backend フィルタリング
|
| 121 |
-
filtered_df = filtered_df[filtered_df["Backend Library"].isin(backend_query)]
|
| 122 |
-
print(f"After backend filter: {filtered_df.shape}")
|
| 123 |
|
| 124 |
print("Filtered dataframe head:")
|
| 125 |
print(filtered_df.head())
|
|
@@ -188,7 +188,7 @@ def update_table(
|
|
| 188 |
add_special_tokens_query: list,
|
| 189 |
num_few_shots_query: list,
|
| 190 |
version_query: list,
|
| 191 |
-
backend_query: list,
|
| 192 |
query: str,
|
| 193 |
*columns,
|
| 194 |
):
|
|
@@ -206,7 +206,7 @@ def update_table(
|
|
| 206 |
add_special_tokens_query,
|
| 207 |
num_few_shots_query,
|
| 208 |
version_query,
|
| 209 |
-
backend_query,
|
| 210 |
)
|
| 211 |
print(f"filtered_df shape after filter_models: {filtered_df.shape}")
|
| 212 |
|
|
@@ -253,7 +253,7 @@ leaderboard_df = filter_models(
|
|
| 253 |
[i.value.name for i in AddSpecialTokens],
|
| 254 |
[i.value.name for i in NumFewShots],
|
| 255 |
[i.value.name for i in Version],
|
| 256 |
-
[i.value.name for i in Backend],
|
| 257 |
)
|
| 258 |
|
| 259 |
leaderboard_df_filtered = filter_models(
|
|
@@ -264,7 +264,7 @@ leaderboard_df_filtered = filter_models(
|
|
| 264 |
[i.value.name for i in AddSpecialTokens],
|
| 265 |
[i.value.name for i in NumFewShots],
|
| 266 |
[i.value.name for i in Version],
|
| 267 |
-
[i.value.name for i in Backend],
|
| 268 |
)
|
| 269 |
|
| 270 |
# DataFrameの初期化部分のみを修正
|
|
@@ -350,12 +350,12 @@ with gr.Blocks() as demo_leaderboard:
|
|
| 350 |
value=[i.value.name for i in Version],
|
| 351 |
elem_id="filter-columns-version",
|
| 352 |
)
|
| 353 |
-
filter_columns_backend = gr.CheckboxGroup(
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
)
|
| 359 |
|
| 360 |
# DataFrameコンポーネントの初期化
|
| 361 |
leaderboard_table = gr.Dataframe(
|
|
@@ -387,7 +387,7 @@ with gr.Blocks() as demo_leaderboard:
|
|
| 387 |
filter_columns_add_special_tokens.change,
|
| 388 |
filter_columns_num_few_shots.change,
|
| 389 |
filter_columns_version.change,
|
| 390 |
-
filter_columns_backend.change,
|
| 391 |
search_bar.submit,
|
| 392 |
]
|
| 393 |
+ [shown_columns.change for shown_columns in shown_columns_dict.values()],
|
|
@@ -400,7 +400,7 @@ with gr.Blocks() as demo_leaderboard:
|
|
| 400 |
filter_columns_add_special_tokens,
|
| 401 |
filter_columns_num_few_shots,
|
| 402 |
filter_columns_version,
|
| 403 |
-
filter_columns_backend,
|
| 404 |
search_bar,
|
| 405 |
]
|
| 406 |
+ [shown_columns for shown_columns in shown_columns_dict.values()],
|
|
|
|
| 79 |
add_special_tokens_query: list,
|
| 80 |
num_few_shots_query: list,
|
| 81 |
version_query: list,
|
| 82 |
+
# backend_query: list,
|
| 83 |
) -> pd.DataFrame:
|
| 84 |
print(f"Initial df shape: {df.shape}")
|
| 85 |
print(f"Initial df content:\n{df}")
|
|
|
|
| 118 |
print(f"After version filter: {filtered_df.shape}")
|
| 119 |
|
| 120 |
# Backend フィルタリング
|
| 121 |
+
# filtered_df = filtered_df[filtered_df["Backend Library"].isin(backend_query)]
|
| 122 |
+
# print(f"After backend filter: {filtered_df.shape}")
|
| 123 |
|
| 124 |
print("Filtered dataframe head:")
|
| 125 |
print(filtered_df.head())
|
|
|
|
| 188 |
add_special_tokens_query: list,
|
| 189 |
num_few_shots_query: list,
|
| 190 |
version_query: list,
|
| 191 |
+
# backend_query: list,
|
| 192 |
query: str,
|
| 193 |
*columns,
|
| 194 |
):
|
|
|
|
| 206 |
add_special_tokens_query,
|
| 207 |
num_few_shots_query,
|
| 208 |
version_query,
|
| 209 |
+
# backend_query,
|
| 210 |
)
|
| 211 |
print(f"filtered_df shape after filter_models: {filtered_df.shape}")
|
| 212 |
|
|
|
|
| 253 |
[i.value.name for i in AddSpecialTokens],
|
| 254 |
[i.value.name for i in NumFewShots],
|
| 255 |
[i.value.name for i in Version],
|
| 256 |
+
# [i.value.name for i in Backend],
|
| 257 |
)
|
| 258 |
|
| 259 |
leaderboard_df_filtered = filter_models(
|
|
|
|
| 264 |
[i.value.name for i in AddSpecialTokens],
|
| 265 |
[i.value.name for i in NumFewShots],
|
| 266 |
[i.value.name for i in Version],
|
| 267 |
+
# [i.value.name for i in Backend],
|
| 268 |
)
|
| 269 |
|
| 270 |
# DataFrameの初期化部分のみを修正
|
|
|
|
| 350 |
value=[i.value.name for i in Version],
|
| 351 |
elem_id="filter-columns-version",
|
| 352 |
)
|
| 353 |
+
# filter_columns_backend = gr.CheckboxGroup(
|
| 354 |
+
# label="Backend Library",
|
| 355 |
+
# choices=[i.value.name for i in Backend],
|
| 356 |
+
# value=[i.value.name for i in Backend],
|
| 357 |
+
# elem_id="filter-columns-backend",
|
| 358 |
+
# )
|
| 359 |
|
| 360 |
# DataFrameコンポーネントの初期化
|
| 361 |
leaderboard_table = gr.Dataframe(
|
|
|
|
| 387 |
filter_columns_add_special_tokens.change,
|
| 388 |
filter_columns_num_few_shots.change,
|
| 389 |
filter_columns_version.change,
|
| 390 |
+
# filter_columns_backend.change,
|
| 391 |
search_bar.submit,
|
| 392 |
]
|
| 393 |
+ [shown_columns.change for shown_columns in shown_columns_dict.values()],
|
|
|
|
| 400 |
filter_columns_add_special_tokens,
|
| 401 |
filter_columns_num_few_shots,
|
| 402 |
filter_columns_version,
|
| 403 |
+
# filter_columns_backend,
|
| 404 |
search_bar,
|
| 405 |
]
|
| 406 |
+ [shown_columns for shown_columns in shown_columns_dict.values()],
|
src/about.py
CHANGED
|
@@ -36,16 +36,16 @@ class Tasks(Enum):
|
|
| 36 |
EL = Task(
|
| 37 |
"scores", "EL", "EL - エンティティリンキング", TaskType.EL, True
|
| 38 |
) # Entity Linking - エンティティリンキング
|
| 39 |
-
FA = Task("scores", "FA", "FA -
|
| 40 |
-
HE = Task("scores", "HE", "HE -
|
| 41 |
MC = Task(
|
| 42 |
-
"scores", "MC", "MC -
|
| 43 |
-
) # Multiple Choice question answering -
|
| 44 |
MR = Task("scores", "MR", "MR - 数学的推論", TaskType.MR, True) # Mathematical Reasoning - 数学的推論
|
| 45 |
MT = Task("scores", "MT", "MT - 機械翻訳", TaskType.MT, True) # Machine Translation - 機械翻訳
|
| 46 |
NLI = Task("scores", "NLI", "NLI - 自然言語推論", TaskType.NLI, True) # Natural Language Inference - 自然言語推論
|
| 47 |
QA = Task("scores", "QA", "QA - 質問応答", TaskType.QA, True) # Question Answering - 質問応答
|
| 48 |
-
RC = Task("scores", "RC", "RC -
|
| 49 |
SUM = Task("scores", "SUM", "SUM - 要約", TaskType.SUM, True) # Summarization - 要約
|
| 50 |
alt_e_to_j_bert_score_ja_f1 = Task("scores", "alt-e-to-j_bert_score_ja_f1", "ALT E to J BERT Score", TaskType.MT)
|
| 51 |
alt_e_to_j_bleu_ja = Task("scores", "alt-e-to-j_bleu_ja", "ALT E to J BLEU", TaskType.MT)
|
|
@@ -225,6 +225,89 @@ To reproduce our results, please follow the instructions of the evaluation tool,
|
|
| 225 |
|
| 226 |
"""
|
| 227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
EVALUATION_QUEUE_TEXT = """
|
| 229 |
## Some good practices before submitting a model
|
| 230 |
|
|
|
|
| 36 |
EL = Task(
|
| 37 |
"scores", "EL", "EL - エンティティリンキング", TaskType.EL, True
|
| 38 |
) # Entity Linking - エンティティリンキング
|
| 39 |
+
FA = Task("scores", "FA", "FA - 基礎解析", TaskType.FA, True) # Fundamental Analysis - 基礎解析
|
| 40 |
+
HE = Task("scores", "HE", "HE - 試験問題", TaskType.HE, True) # Human Examination - 試験問題
|
| 41 |
MC = Task(
|
| 42 |
+
"scores", "MC", "MC - 多肢選択式問題", TaskType.MC, True
|
| 43 |
+
) # Multiple Choice question answering - 多肢選択式問題
|
| 44 |
MR = Task("scores", "MR", "MR - 数学的推論", TaskType.MR, True) # Mathematical Reasoning - 数学的推論
|
| 45 |
MT = Task("scores", "MT", "MT - 機械翻訳", TaskType.MT, True) # Machine Translation - 機械翻訳
|
| 46 |
NLI = Task("scores", "NLI", "NLI - 自然言語推論", TaskType.NLI, True) # Natural Language Inference - 自然言語推論
|
| 47 |
QA = Task("scores", "QA", "QA - 質問応答", TaskType.QA, True) # Question Answering - 質問応答
|
| 48 |
+
RC = Task("scores", "RC", "RC - 文章読解", TaskType.RC, True) # Reading Comprehension - 文章読解
|
| 49 |
SUM = Task("scores", "SUM", "SUM - 要約", TaskType.SUM, True) # Summarization - 要約
|
| 50 |
alt_e_to_j_bert_score_ja_f1 = Task("scores", "alt-e-to-j_bert_score_ja_f1", "ALT E to J BERT Score", TaskType.MT)
|
| 51 |
alt_e_to_j_bleu_ja = Task("scores", "alt-e-to-j_bleu_ja", "ALT E to J BLEU", TaskType.MT)
|
|
|
|
| 225 |
|
| 226 |
"""
|
| 227 |
|
| 228 |
+
LLM_BENCHMARKS_TEXT_JP = """
|
| 229 |
+
## 仕組み
|
| 230 |
+
📈 私たちは評価ツール [llm-jp-eval](https://github.com/llm-jp/llm-jp-eval) を活用し、16のタスクで日本語の大規模言語モデルを評価します。このツールは、様々な評価タスクで日本語LLMを評価するための統一的なフレームワークです。
|
| 231 |
+
|
| 232 |
+
**NLI(自然言語推論)**
|
| 233 |
+
|
| 234 |
+
* `Jamp`、時間推論に焦点を当てた日本語NLIベンチマーク [ソース](https://github.com/tomo-ut/temporalNLI_dataset)(ライセンス CC BY-SA 4.0)
|
| 235 |
+
|
| 236 |
+
* `JaNLI`、日本語の敵対的推論データセット [ソース](https://github.com/verypluming/JaNLI)(ライセンス CC BY-SA 4.0)
|
| 237 |
+
|
| 238 |
+
* `JNLI`、日本語自然言語推論(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
|
| 239 |
+
|
| 240 |
+
* `JSeM`、日本語意味論テストセット [ソース](https://github.com/DaisukeBekki/JSeM)(ライセンス BSD 3-Clause)
|
| 241 |
+
|
| 242 |
+
* `JSICK`、構成的知識を含む日本語文 [ソース](https://github.com/verypluming/JSICK)(ライセンス CC BY-SA 4.0)
|
| 243 |
+
|
| 244 |
+
**QA(質問応答)**
|
| 245 |
+
|
| 246 |
+
* `JEMHopQA`、日本語の説明可能なマルチホップ質問応答 [ソース](https://github.com/aiishii/JEMHopQA)(ライセンス CC BY-SA 4.0)
|
| 247 |
+
|
| 248 |
+
* `NIILC`、NIILC質問応答データセット [ソース](https://github.com/mynlp/niilc-qa)(ライセンス CC BY-SA 4.0)
|
| 249 |
+
|
| 250 |
+
* `JAQKET`、クイズを題材とした日本語QAデータセット [ソース](https://www.nlp.ecei.tohoku.ac.jp/projects/jaqket/)(ライセンス CC BY-SA 4.0 - 企業利用には別途ライセンスが必要)
|
| 251 |
+
|
| 252 |
+
**RC(読解)**
|
| 253 |
+
|
| 254 |
+
* `JSQuAD`、SQuADの日本語版(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
|
| 255 |
+
|
| 256 |
+
**MC(選択式質問応答)**
|
| 257 |
+
|
| 258 |
+
* `JCommonsenseMorality`、常識的な道徳理解を評価する日本語データセット [ソース](https://github.com/Language-Media-Lab/commonsense-moral-ja)(ライセンス MIT License)
|
| 259 |
+
|
| 260 |
+
* `JCommonsenseQA`、CommonsenseQAの日本語版 [ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
|
| 261 |
+
|
| 262 |
+
* `KUCI`、京都大学常識推論データセット [ソース](https://github.com/ku-nlp/KUCI)(ライセンス CC BY-SA 4.0)
|
| 263 |
+
|
| 264 |
+
**EL(エンティティリンク)**
|
| 265 |
+
|
| 266 |
+
* `chABSA`、アスペクトベースの感情分析データセット [ソース](https://github.com/chakki-works/chABSA-dataset)(ライセンス CC BY-SA 4.0)
|
| 267 |
+
|
| 268 |
+
**FA(基本的な分析)**
|
| 269 |
+
|
| 270 |
+
* `Wikipedia Annotated Corpus`、[ソース](https://github.com/ku-nlp/WikipediaAnnotatedCorpus)(ライセンス CC BY-SA 4.0)
|
| 271 |
+
|
| 272 |
+
タスク一覧:(読解予測、固有表現認識(NER)、依存構造解析、述語項構造解析(PAS)、共参照解析)
|
| 273 |
+
|
| 274 |
+
**MR(数学的推論)**
|
| 275 |
+
|
| 276 |
+
* `MAWPS`、MAWPS(A Math Word Problem Repository)の日本語版 [ソース](https://github.com/nlp-waseda/chain-of-thought-ja-dataset)(ライセンス Apache-2.0)
|
| 277 |
+
|
| 278 |
+
* `MGSM`、MGSM(Multilingual Grade School Math Benchmark)の日本語部分 [ソース](https://huggingface.co/datasets/juletxara/mgsm)(ライセンス MIT License)
|
| 279 |
+
|
| 280 |
+
**MT(機械翻訳)**
|
| 281 |
+
|
| 282 |
+
* `ALT`、アジア言語ツリーバンク(ALT) - 並列コーパス [ソース](https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/index.html)(ライセンス CC BY-SA 4.0)
|
| 283 |
+
|
| 284 |
+
* `WikiCorpus`、京都市に関するWikipedia記事の日本語-英語対訳コーパス [ソース](https://alaginrc.nict.go.jp/WikiCorpus/)(ライセンス CC BY-SA 3.0)
|
| 285 |
+
|
| 286 |
+
**STS(意味的テキスト類似度)**
|
| 287 |
+
|
| 288 |
+
このタスクはllm-jp-evalでサポートされていますが、評価スコアの平均には含まれていません。
|
| 289 |
+
|
| 290 |
+
* `JSTS`、STS(Semantic Textual Similarity)の日本語版(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
|
| 291 |
+
|
| 292 |
+
**HE(試験問題)**
|
| 293 |
+
|
| 294 |
+
* `MMLU`、大規模マルチタスク言語理解の測定 [ソース](https://github.com/hendrycks/test)(ライセンス MIT License)
|
| 295 |
+
|
| 296 |
+
* `JMMLU`、日本語大規模マルチタスク言語理解ベンチマーク [ソース](https://github.com/nlp-waseda/JMMLU)(ライセンス CC BY-SA 4.0(3つのタスクはCC BY-NC-ND 4.0ライセンス))
|
| 297 |
+
|
| 298 |
+
**CG(コード生成)**
|
| 299 |
+
|
| 300 |
+
* `MBPP`、Mostly Basic Python Problems(MBPP)の日本語版 [ソース](https://huggingface.co/datasets/llm-jp/mbpp-ja)(ライセンス CC BY-SA 4.0)
|
| 301 |
+
|
| 302 |
+
**SUM(要約)**
|
| 303 |
+
|
| 304 |
+
* `XL-Sum`、44言語の大規模な多言語抽象要約 [ソース](https://github.com/csebuetnlp/xl-sum)(ライセンス CC BY-NC-SA 4.0、非商用ライセンスのため、このデータセットは使用しません。ライセンスと利用規約に明確に同意した場合を除きます)
|
| 305 |
+
|
| 306 |
+
## 再現性
|
| 307 |
+
私たちの結果を再現するには、評価ツール **llm-jp-eval** の指示に従ってください。詳細は [日本語](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) と [英語](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md) でご覧いただけます。
|
| 308 |
+
"""
|
| 309 |
+
|
| 310 |
+
|
| 311 |
EVALUATION_QUEUE_TEXT = """
|
| 312 |
## Some good practices before submitting a model
|
| 313 |
|
src/display/utils.py
CHANGED
|
@@ -56,7 +56,7 @@ auto_eval_column_dict.append(["add_special_tokens", ColumnContent, ColumnContent
|
|
| 56 |
auto_eval_column_dict.append(
|
| 57 |
["llm_jp_eval_version", ColumnContent, ColumnContent("llm-jp-eval version", "str", False)]
|
| 58 |
)
|
| 59 |
-
auto_eval_column_dict.append(["backend", ColumnContent, ColumnContent("Backend Library", "str", False)])
|
| 60 |
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
|
| 61 |
|
| 62 |
# We use make dataclass to dynamically fill the scores from Tasks
|
|
|
|
| 56 |
auto_eval_column_dict.append(
|
| 57 |
["llm_jp_eval_version", ColumnContent, ColumnContent("llm-jp-eval version", "str", False)]
|
| 58 |
)
|
| 59 |
+
auto_eval_column_dict.append(["backend", ColumnContent, ColumnContent("Backend Library", "str", False, dummy=True)])
|
| 60 |
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
|
| 61 |
|
| 62 |
# We use make dataclass to dynamically fill the scores from Tasks
|
style.css
CHANGED
|
@@ -6,6 +6,8 @@
|
|
| 6 |
}
|
| 7 |
|
| 8 |
/* Hides the final AutoEvalColumn */
|
|
|
|
|
|
|
| 9 |
#llm-benchmark-tab-table table td:last-child,
|
| 10 |
#llm-benchmark-tab-table table th:last-child {
|
| 11 |
display: none;
|
|
|
|
| 6 |
}
|
| 7 |
|
| 8 |
/* Hides the final AutoEvalColumn */
|
| 9 |
+
#llm-benchmark-tab-table table td:nth-last-child(2),
|
| 10 |
+
#llm-benchmark-tab-table table th:nth-last-child(2),
|
| 11 |
#llm-benchmark-tab-table table td:last-child,
|
| 12 |
#llm-benchmark-tab-table table th:last-child {
|
| 13 |
display: none;
|