diff --git a/data_loader.py b/data_loader.py index 73af67220601cfa910197787787275a7cc5d6db8..b5e90d82d8706d74a1b8ad763f12122cf85aff44 100644 --- a/data_loader.py +++ b/data_loader.py @@ -648,7 +648,7 @@ CARDS = """
28
Total Models
-
20 Private
+
25 Private
8 Open Source
@@ -1320,7 +1320,7 @@ evaluate_handler.finish()

Updated Periodically

diff --git a/output/gpt-4.1-2025-04-14/BFCL_v3_irrelevance.parquet b/output/gpt-4.1-2025-04-14/BFCL_v3_irrelevance.parquet new file mode 100644 index 0000000000000000000000000000000000000000..cafd5fcfb487a8046e247c9d6142094b1e958c0e --- /dev/null +++ b/output/gpt-4.1-2025-04-14/BFCL_v3_irrelevance.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9657391d41255ffdbd4de7780e9dfe6a451bb5c6677606ac80bcbaef79541ef +size 38852 diff --git a/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_base_multi_func_call.parquet b/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_base_multi_func_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..765c48c36b9fd5460f283ffec054ebe99f17bbcf --- /dev/null +++ b/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_base_multi_func_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0e7de17ae41587385da3d6f94ee3eed970796a4adf471ff1c44880dd6947ec4 +size 23592 diff --git a/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_base_single_func_call.parquet b/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_base_single_func_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7b324c0139d711aff24b918daf05b8a0a04ea1ef --- /dev/null +++ b/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_base_single_func_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f160b5db157355d39eb9dd59d3bb49251f673d1ca9cca1cfee676bc7ed43cde +size 21766 diff --git a/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_composite.parquet b/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_composite.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ae49abe0d6b4919a4bb30237b88c153acbc0ce22 --- /dev/null +++ b/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_composite.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34943b87685bb6952d23fad943423a814b395c162f2348a04424f68a2a8a0410 +size 43934 diff --git a/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_long_context.parquet b/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_long_context.parquet new file mode 100644 index 0000000000000000000000000000000000000000..15ae5fbe1a3879a11c9a53089dd240a1998e2f65 --- /dev/null +++ b/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_long_context.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a910bf072b19ea7efab56bd124b1a76be0eefd17184c703a130a5e712103eb +size 36945 diff --git a/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_miss_func.parquet b/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_miss_func.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6f5f45474b56db54477d353d0b765a73f17ba46d --- /dev/null +++ b/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_miss_func.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e28f5fc8daf93f8d1918f064b39e8844e3f6b0890d8c542fa771cb7d0b079884 +size 39152 diff --git a/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_miss_param.parquet b/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_miss_param.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8921586a118f00a7e5aa37d17309c65ac7971150 --- /dev/null +++ b/output/gpt-4.1-2025-04-14/BFCL_v3_multi_turn_miss_param.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:966faf2bf7cb8ddcdab6cf9713143252726b0020ea04063bf61f6b403214cbd7 +size 43237 diff --git a/output/gpt-4.1-2025-04-14/tau_long_context.parquet b/output/gpt-4.1-2025-04-14/tau_long_context.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a9e4ddae5de250a94075cf92ba899edec5f241a7 --- /dev/null +++ b/output/gpt-4.1-2025-04-14/tau_long_context.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64532303dd7e812fb5ef38d72608d0968c5bb09bd50b010b585c294eb366d54e +size 44540 diff --git a/output/gpt-4.1-2025-04-14/toolace_single_func_call_1.parquet b/output/gpt-4.1-2025-04-14/toolace_single_func_call_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..717a5e30f5ef08730084a9ed4b9db190c014bb0d --- /dev/null +++ b/output/gpt-4.1-2025-04-14/toolace_single_func_call_1.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49a80caba95af5acbdb1cad1980ae21b35268d46165a956052080697757cee33 +size 14854 diff --git a/output/gpt-4.1-2025-04-14/toolace_single_func_call_2.parquet b/output/gpt-4.1-2025-04-14/toolace_single_func_call_2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7045a8471b4f66df6dd4598d058b08709748557a --- /dev/null +++ b/output/gpt-4.1-2025-04-14/toolace_single_func_call_2.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a9060eddc7e91f8fdbc8ba9baaa0410cf73ed254dc6f94c1a98d4991a37cdb3 +size 11794 diff --git a/output/gpt-4.1-2025-04-14/xlam_multiple_tool_multiple_call.parquet b/output/gpt-4.1-2025-04-14/xlam_multiple_tool_multiple_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..40f3d5cd315f879bab60fdb530cd3359ce895d04 --- /dev/null +++ b/output/gpt-4.1-2025-04-14/xlam_multiple_tool_multiple_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5b899c060909ba19a23a5fa609fa17cd55113f4aaf1ad2b2f71a5d07645f216 +size 100417 diff --git a/output/gpt-4.1-2025-04-14/xlam_multiple_tool_single_call.parquet b/output/gpt-4.1-2025-04-14/xlam_multiple_tool_single_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2977943c0a8f87fdba253b3eefaa792f48a9bf3d --- /dev/null +++ b/output/gpt-4.1-2025-04-14/xlam_multiple_tool_single_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17d725be681561ab7a1172be69e5239dd772a2ca698b79e7ee00f7c2a82083ae +size 40235 diff --git a/output/gpt-4.1-2025-04-14/xlam_single_tool_multiple_call.parquet b/output/gpt-4.1-2025-04-14/xlam_single_tool_multiple_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c82becb41b9d20a374213090cb8bce80269c423c --- /dev/null +++ b/output/gpt-4.1-2025-04-14/xlam_single_tool_multiple_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d003963d351f1d8a7ee789ea7d6955c17c9a37c0356204d323e11e9267708327 +size 31058 diff --git a/output/gpt-4.1-2025-04-14/xlam_single_tool_single_call.parquet b/output/gpt-4.1-2025-04-14/xlam_single_tool_single_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9a67dece057c6b30c43e42f9be51d8353e31248f --- /dev/null +++ b/output/gpt-4.1-2025-04-14/xlam_single_tool_single_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1970fc916291eafe5514a84c1cb0b074901dc2f76183fce13a059b28bbe5d6aa +size 44787 diff --git a/output/gpt-4.1-2025-04-14/xlam_tool_miss.parquet b/output/gpt-4.1-2025-04-14/xlam_tool_miss.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4e7d5051191bb685275c69666e3af909710ac898 --- /dev/null +++ b/output/gpt-4.1-2025-04-14/xlam_tool_miss.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8167ac2915b4e33c18126edbb38389cd65584326884b7aa779559e1580b10c19 +size 51042 diff --git a/output/gpt-4.1-mini-2025-04-14/BFCL_v3_irrelevance.parquet b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_irrelevance.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1ffdeee12743578afa002bcd5ce781c8e1f20e5e --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_irrelevance.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fccfcd1ac0cad9fc1f1d59de1d4ed4d898900609a2334026c6ee32c3e6ba8f9f +size 44676 diff --git a/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_base_multi_func_call.parquet b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_base_multi_func_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9aa28c952125922acb65d5e696d7f676a79d78d3 --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_base_multi_func_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e12a309e4e7e26ef831397543b247441f737d3356184f84747f032eb45258e0 +size 23552 diff --git a/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_base_single_func_call.parquet b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_base_single_func_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..592e6ecadfe905757644b33a8cbb47d02cae571a --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_base_single_func_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:535415d1dba1bb7db819c159e8f94328084fc0476e1ae323802489d7a6d3d04a +size 22540 diff --git a/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_composite.parquet b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_composite.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9db11a160158f910cc706a8a57c9d9f1f4874d92 --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_composite.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:735629b98e55fdaf7c303f86b7cef03a8e2bda647f924cc34c329d9d6f546bdf +size 44786 diff --git a/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_long_context.parquet b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_long_context.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3355165fc4c3c19ba3269cb06054d116a2d99352 --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_long_context.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34742abfaf1b7c2d026ace161d64a31ac9b70e8ed40b5759349448e5df1adae8 +size 38675 diff --git a/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_miss_func.parquet b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_miss_func.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8d2da8f651cde36736018b9d3511121ef069bf35 --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_miss_func.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3bf3f9733aa82be34ffe0ed70ce18d8c71760db125ef4dab4d0b83370c807bd +size 43312 diff --git a/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_miss_param.parquet b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_miss_param.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1a6d870cd483425f22406df6b6c2a364280b36bf --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/BFCL_v3_multi_turn_miss_param.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80f6a75b1d8f5b3850fb9203e3118211e40e10f3b29ecfa07acc76105d8ff0fc +size 41539 diff --git a/output/gpt-4.1-mini-2025-04-14/tau_long_context.parquet b/output/gpt-4.1-mini-2025-04-14/tau_long_context.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0934bb1cf06b3e96721975accf4c109a607da4e3 --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/tau_long_context.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf4a3bc10994592e948b32fe969104166452a700f41b5eacee93543368f4fd1 +size 40845 diff --git a/output/gpt-4.1-mini-2025-04-14/toolace_single_func_call_1.parquet b/output/gpt-4.1-mini-2025-04-14/toolace_single_func_call_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a7149618bfec330f9af76dcce8c039e0ae905b53 --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/toolace_single_func_call_1.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b0c4739d04f758e2b19002bccbb7e458d2003ad5d2064cd71fe0f5149de6a8 +size 18332 diff --git a/output/gpt-4.1-mini-2025-04-14/toolace_single_func_call_2.parquet b/output/gpt-4.1-mini-2025-04-14/toolace_single_func_call_2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..429c94a875c904a8671c2bb8e70aa2271af17c7f --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/toolace_single_func_call_2.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:680d12f42944f33d23986fb9ef3d09a7c58032598ca07a99357f567ca0dc39f7 +size 12367 diff --git a/output/gpt-4.1-mini-2025-04-14/xlam_multiple_tool_multiple_call.parquet b/output/gpt-4.1-mini-2025-04-14/xlam_multiple_tool_multiple_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6ad307ab629271ad1166f8b4e92ba40cf9d124a8 --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/xlam_multiple_tool_multiple_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf98a01341a4f2bd7c2a045db5c16f692d76833222bd55688ff321d9f535d6a +size 102460 diff --git a/output/gpt-4.1-mini-2025-04-14/xlam_multiple_tool_single_call.parquet b/output/gpt-4.1-mini-2025-04-14/xlam_multiple_tool_single_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..42903960df8784cfe312c9ae21df7ea6a704196a --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/xlam_multiple_tool_single_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c31ac0a4d91c91a6702d2f0cadea80e4245a1b19bc1565409465da2b13fe083 +size 40289 diff --git a/output/gpt-4.1-mini-2025-04-14/xlam_single_tool_multiple_call.parquet b/output/gpt-4.1-mini-2025-04-14/xlam_single_tool_multiple_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c7e9485bdb09b25423b4a5b775b52e69eb5ea440 --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/xlam_single_tool_multiple_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f1f7ebefc694dc464a505617dc5daf2fda4421635fdeaad189d2e4de8abdd85 +size 30261 diff --git a/output/gpt-4.1-mini-2025-04-14/xlam_single_tool_single_call.parquet b/output/gpt-4.1-mini-2025-04-14/xlam_single_tool_single_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..55ba17d182066b49a80eee79b844831568f40d95 --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/xlam_single_tool_single_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18aa8fc4d1511927ee404e5eeea9006d18c4a20738c3d64471dbaec99b76563d +size 43614 diff --git a/output/gpt-4.1-mini-2025-04-14/xlam_tool_miss.parquet b/output/gpt-4.1-mini-2025-04-14/xlam_tool_miss.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d5c24d5740af549e5dd9a36e5efb8d9153b627d8 --- /dev/null +++ b/output/gpt-4.1-mini-2025-04-14/xlam_tool_miss.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1e7f4023b92c464d248d77fcf950dc16f1b9d77528bfa72f1975a3ac0812763 +size 54881 diff --git a/output/gpt-4.1-nano-2025-04-14/BFCL_v3_irrelevance.parquet b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_irrelevance.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b2ad6197c95cc7bb7dd360483206da24d3c528f1 --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_irrelevance.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d085185382e506e8780b60a6eceaac6b6bc26f3d1cfdfe88f9c73ade8f6a6f2 +size 28700 diff --git a/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_base_multi_func_call.parquet b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_base_multi_func_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06e36625a3c6291299aae77071eaaa16d5849892 --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_base_multi_func_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c1484a10affc86ba086949da5e3ba6f4dd01eea27b1c9253e662aa8cf32b782 +size 23233 diff --git a/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_base_single_func_call.parquet b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_base_single_func_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9e57f824425302ff9ded5eee0040546229c637cd --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_base_single_func_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c8d3eb0d857fd78b5b9c30a0ac6cf230515cad9a7fa87551fb165d3870b2a4f +size 23672 diff --git a/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_composite.parquet b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_composite.parquet new file mode 100644 index 0000000000000000000000000000000000000000..63536bc38d6c3614211e61b327a4c4457ad93c13 --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_composite.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50978eaf60ca133ce128a33fd4177c8f8dbd950226b2020b784adc54965e0995 +size 40604 diff --git a/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_long_context.parquet b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_long_context.parquet new file mode 100644 index 0000000000000000000000000000000000000000..db82f63b7ba3ba73e17b7c3ffbe85b1598f9b430 --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_long_context.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8806fca6974028bd80dd947a0bed74f85cc21e708cc79242d1f433706aaf119c +size 38518 diff --git a/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_miss_func.parquet b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_miss_func.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9abafda526bdad9efb6ae4123e07df26eb92c339 --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_miss_func.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e64dd6f4dfa2f9a5c6baa832d133784010996666b393268e8c2f9a773c8d880 +size 39577 diff --git a/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_miss_param.parquet b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_miss_param.parquet new file mode 100644 index 0000000000000000000000000000000000000000..42debc7ab3daf66f46f71ef9b976fba8f2eaaa0a --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/BFCL_v3_multi_turn_miss_param.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48f0bcadd582afbe243186384d30c48d67986c852d51ed5cf2874cb5c316989a +size 40730 diff --git a/output/gpt-4.1-nano-2025-04-14/tau_long_context.parquet b/output/gpt-4.1-nano-2025-04-14/tau_long_context.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9ef2da5589dd2a7185541b34c7305356f7efef44 --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/tau_long_context.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec16220b59b3a8a36deff0f0bc200a8b693a2a719f817615bf11478b08c47447 +size 40606 diff --git a/output/gpt-4.1-nano-2025-04-14/toolace_single_func_call_1.parquet b/output/gpt-4.1-nano-2025-04-14/toolace_single_func_call_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1d32e916a4fb6d153d672057c4f2dce128c93a7c --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/toolace_single_func_call_1.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22bdc58ad908b88fac7d1e6eb6263596b6d0f8cf3ca1ee9a34fa2c250cbed6a8 +size 15891 diff --git a/output/gpt-4.1-nano-2025-04-14/toolace_single_func_call_2.parquet b/output/gpt-4.1-nano-2025-04-14/toolace_single_func_call_2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e9a8d410b77744186a31711f296e29cdaeda2f57 --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/toolace_single_func_call_2.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dcc1ead32b72b83b86f7811f0eccfc2fee69aa30e64feb0095b362acfd20d44 +size 11274 diff --git a/output/gpt-4.1-nano-2025-04-14/xlam_multiple_tool_multiple_call.parquet b/output/gpt-4.1-nano-2025-04-14/xlam_multiple_tool_multiple_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..47a13fbb391fb80069b62121c8d1bf3c5591adbe --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/xlam_multiple_tool_multiple_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8316f6de7d0f141aa4543642b78962fd30c7edf52a4092ac82f7e4a52af93ab +size 101127 diff --git a/output/gpt-4.1-nano-2025-04-14/xlam_multiple_tool_single_call.parquet b/output/gpt-4.1-nano-2025-04-14/xlam_multiple_tool_single_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..14c0ca4d29cb5c3933660139992bda3ff897722f --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/xlam_multiple_tool_single_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2589cec4f2cc428155d7a6386dc707935fe7e0f5e70554550ff3cb9664ace8fb +size 42309 diff --git a/output/gpt-4.1-nano-2025-04-14/xlam_single_tool_multiple_call.parquet b/output/gpt-4.1-nano-2025-04-14/xlam_single_tool_multiple_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6cbaee06897641068279b6560ab5ed53f81aa6de --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/xlam_single_tool_multiple_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c549fd43116a5ed343c5829e080c70f1a5b9475eb8bb4b978be27551edcf9eb +size 31369 diff --git a/output/gpt-4.1-nano-2025-04-14/xlam_single_tool_single_call.parquet b/output/gpt-4.1-nano-2025-04-14/xlam_single_tool_single_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..cbe6b82c5c0820ebe6a5d5f0977d2e6b7428223b --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/xlam_single_tool_single_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97872c5db138fd78c790030526253f7437e522df850d2fa26c722dbe44abb696 +size 45933 diff --git a/output/gpt-4.1-nano-2025-04-14/xlam_tool_miss.parquet b/output/gpt-4.1-nano-2025-04-14/xlam_tool_miss.parquet new file mode 100644 index 0000000000000000000000000000000000000000..82e7cdc18e2cb8d14ed22e7a9031bc271f6f101b --- /dev/null +++ b/output/gpt-4.1-nano-2025-04-14/xlam_tool_miss.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80720e28be58215851a00a1437bcb1a02032321ee99f497771248ff605285541 +size 53185 diff --git a/output/o3-2025-04-16/BFCL_v3_irrelevance.parquet b/output/o3-2025-04-16/BFCL_v3_irrelevance.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c5802b8b7ed4056c42de0d6a17224dc2f8044ad0 --- /dev/null +++ b/output/o3-2025-04-16/BFCL_v3_irrelevance.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:792d5f48891b2d40be615d33b2b78381610e7b8a776f13f0794ab687523fd3d6 +size 61887 diff --git a/output/o3-2025-04-16/BFCL_v3_multi_turn_base_multi_func_call.parquet b/output/o3-2025-04-16/BFCL_v3_multi_turn_base_multi_func_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d28e6e67ba056abfb6158c386bba7e59d4ebc417 --- /dev/null +++ b/output/o3-2025-04-16/BFCL_v3_multi_turn_base_multi_func_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcdd60cc53f3bb310004c3fc584d345921f75106040a66423e3171f45e54ef86 +size 24231 diff --git a/output/o3-2025-04-16/BFCL_v3_multi_turn_base_single_func_call.parquet b/output/o3-2025-04-16/BFCL_v3_multi_turn_base_single_func_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..911377f87795f81549489b9acd25b0c5e4852152 --- /dev/null +++ b/output/o3-2025-04-16/BFCL_v3_multi_turn_base_single_func_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27606cdd3aa865a4f79fdd0287e0578b2b74f2b3e9cdce307507d06b01da6814 +size 23076 diff --git a/output/o3-2025-04-16/BFCL_v3_multi_turn_composite.parquet b/output/o3-2025-04-16/BFCL_v3_multi_turn_composite.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0989351918321b35e720abc0d0bcfb1c12e485bc --- /dev/null +++ b/output/o3-2025-04-16/BFCL_v3_multi_turn_composite.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fbc0feb6b0a719b02c17f9cbd947bcc4ef5ae4e7c0f5cbe4e4cb0c2021a548f +size 47481 diff --git a/output/o3-2025-04-16/BFCL_v3_multi_turn_long_context.parquet b/output/o3-2025-04-16/BFCL_v3_multi_turn_long_context.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d076f21941865cc0bd3ba8f5d9b6b579a6fad151 --- /dev/null +++ b/output/o3-2025-04-16/BFCL_v3_multi_turn_long_context.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61ab959085e483007d9f91af03064111c4a52ccafd668e3ace401f5e2feff291 +size 40657 diff --git a/output/o3-2025-04-16/BFCL_v3_multi_turn_miss_func.parquet b/output/o3-2025-04-16/BFCL_v3_multi_turn_miss_func.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8fe5efc2eff43f3da0e06fc78708caa050ab8e7f --- /dev/null +++ b/output/o3-2025-04-16/BFCL_v3_multi_turn_miss_func.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dea705ce4c306cfcee64e6bef2a9a0ad04b4eb056c89a6c69e35b35078445e46 +size 38292 diff --git a/output/o3-2025-04-16/BFCL_v3_multi_turn_miss_param.parquet b/output/o3-2025-04-16/BFCL_v3_multi_turn_miss_param.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a6003cc618da0ccf3837cca1a41cc925c753ee38 --- /dev/null +++ b/output/o3-2025-04-16/BFCL_v3_multi_turn_miss_param.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3db73cf29d5a9790a8491a62ea0a44af495005074c9b2b15eea4468f8b61137 +size 44744 diff --git a/output/o3-2025-04-16/tau_long_context.parquet b/output/o3-2025-04-16/tau_long_context.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d2854be081439bdb8b4f6c52d655b2609286c046 --- /dev/null +++ b/output/o3-2025-04-16/tau_long_context.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:889462c24fca77c094c8dc11705618076d018bd7ac74de958cf0ff9aa8d30a45 +size 46882 diff --git a/output/o3-2025-04-16/toolace_single_func_call_1.parquet b/output/o3-2025-04-16/toolace_single_func_call_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..242d7cdc2d29eaad165ec498988d11ad71158b74 --- /dev/null +++ b/output/o3-2025-04-16/toolace_single_func_call_1.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5fdf0237915086ab8c8906b282b7a31de54c5154dab6d6735e402770761e626 +size 16659 diff --git a/output/o3-2025-04-16/toolace_single_func_call_2.parquet b/output/o3-2025-04-16/toolace_single_func_call_2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ef9579b7084565e5cf5f0a6215841f85f035c27b --- /dev/null +++ b/output/o3-2025-04-16/toolace_single_func_call_2.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27ccec9aff516c2fd8ac63da4c3d628b2540ae79257755611930b4d86d7acabd +size 10604 diff --git a/output/o3-2025-04-16/xlam_multiple_tool_multiple_call.parquet b/output/o3-2025-04-16/xlam_multiple_tool_multiple_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..924ec1169f3e380800d982497406618a13ecf6ce --- /dev/null +++ b/output/o3-2025-04-16/xlam_multiple_tool_multiple_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b46322e4ecc1cf3fc82c88a8cc0c49755d83b3e27fab8dfa217d86f658865e8e +size 89563 diff --git a/output/o3-2025-04-16/xlam_multiple_tool_single_call.parquet b/output/o3-2025-04-16/xlam_multiple_tool_single_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c64c64db505183ad5b6ed30cb5371cad5e108dce --- /dev/null +++ b/output/o3-2025-04-16/xlam_multiple_tool_single_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddad1d72d2ee7fa2a24c1f7cef3d916322063be136bcc125ebb31649a0a871ec +size 39668 diff --git a/output/o3-2025-04-16/xlam_single_tool_multiple_call.parquet b/output/o3-2025-04-16/xlam_single_tool_multiple_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..efdd9ca07bbe3adfd2c35a1bad4b92d1f2c41ec0 --- /dev/null +++ b/output/o3-2025-04-16/xlam_single_tool_multiple_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e853236c021c2a5d2e7bf4460161fec065a2d5b3f0317ecdaed60c5d6620c60 +size 29810 diff --git a/output/o3-2025-04-16/xlam_single_tool_single_call.parquet b/output/o3-2025-04-16/xlam_single_tool_single_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..80aa43fce6d9a8924c8467e4cb267572875eb182 --- /dev/null +++ b/output/o3-2025-04-16/xlam_single_tool_single_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0d8857da8661f49a93f3ce7778d89a018d7096d19f029a772747938d5ac6fd5 +size 52004 diff --git a/output/o3-2025-04-16/xlam_tool_miss.parquet b/output/o3-2025-04-16/xlam_tool_miss.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79b5df5580cd46b57a7c4bb36d0792363711fca9 --- /dev/null +++ b/output/o3-2025-04-16/xlam_tool_miss.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6777dfd00ccc4ec1db9353a70ed8d840ed771c258ac132f9ff2b20b78833f328 +size 52400 diff --git a/output/o4-mini-2025-04-16/BFCL_v3_irrelevance.parquet b/output/o4-mini-2025-04-16/BFCL_v3_irrelevance.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ae8481506f54ec68217cb2ea6cdf379b5b05d829 --- /dev/null +++ b/output/o4-mini-2025-04-16/BFCL_v3_irrelevance.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3ccaa263af840f0586152208018fe613e1f868a3b94268fe23519f9209ba306 +size 49361 diff --git a/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_base_multi_func_call.parquet b/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_base_multi_func_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9850efbea9dc466f9046fc07cada300742593bfd --- /dev/null +++ b/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_base_multi_func_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8caccf724b21af65b12e7562730aa4e43503cdeafa1de941e500060f8a29f69f +size 23480 diff --git a/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_base_single_func_call.parquet b/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_base_single_func_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..dad1d2950b0753a710c67b0ddc1a4c3c7c480d05 --- /dev/null +++ b/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_base_single_func_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17898cad71f2737badfee4eb22fdab97103260c036cf7be70107476ab2b568c9 +size 21102 diff --git a/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_composite.parquet b/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_composite.parquet new file mode 100644 index 0000000000000000000000000000000000000000..be4afefb20af311ea33f87add2a7455533f1c3ca --- /dev/null +++ b/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_composite.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a78322a4b0c6ebb039f1e7322d3273ae5b8c85d3314dd3c0976fa96d41758c +size 46160 diff --git a/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_long_context.parquet b/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_long_context.parquet new file mode 100644 index 0000000000000000000000000000000000000000..31d07d5593d7b0c6f284dd730efa1750b590991f --- /dev/null +++ b/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_long_context.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e2871c1c3303c004731510d84b93d25a1858f31853bf3199584c2add7cf8477 +size 36372 diff --git a/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_miss_func.parquet b/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_miss_func.parquet new file mode 100644 index 0000000000000000000000000000000000000000..372b93a5acd4d10853710faa2e6ca25f76c33008 --- /dev/null +++ b/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_miss_func.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:437e4ffa080ee0351584eda9a9081a845affb07af5db98a484547fa8fc6d7f3f +size 37102 diff --git a/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_miss_param.parquet b/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_miss_param.parquet new file mode 100644 index 0000000000000000000000000000000000000000..618736c4a84628e36fcc006ab240582eb80633e2 --- /dev/null +++ b/output/o4-mini-2025-04-16/BFCL_v3_multi_turn_miss_param.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2f4defe4c86bcbb68d9bb28198660f90790d340d50abe02a154e2e49b6a05cb +size 44158 diff --git a/output/o4-mini-2025-04-16/tau_long_context.parquet b/output/o4-mini-2025-04-16/tau_long_context.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b1bbbde78a530a6fddad97721062dbe8c260208f --- /dev/null +++ b/output/o4-mini-2025-04-16/tau_long_context.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d65f9320bd7aa34adab6d9b7c1188233241fdc68ee219f3cf54b2c0148876db +size 44613 diff --git a/output/o4-mini-2025-04-16/toolace_single_func_call_1.parquet b/output/o4-mini-2025-04-16/toolace_single_func_call_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..81ac7cde43a4ef7e4dcbf3756c3a9c2ac5f2a8ef --- /dev/null +++ b/output/o4-mini-2025-04-16/toolace_single_func_call_1.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e957bde2a17898d5e2f106c26b01170ba424a5b46825bdee2c3cbdfaf1b1c41 +size 18075 diff --git a/output/o4-mini-2025-04-16/toolace_single_func_call_2.parquet b/output/o4-mini-2025-04-16/toolace_single_func_call_2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6339a73f5763df48fbd6af8cb35a71158eb4be10 --- /dev/null +++ b/output/o4-mini-2025-04-16/toolace_single_func_call_2.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c08df248517875bf225bec20014339f51e1e83f9cbce133e3d876e20ec0cbf3 +size 10774 diff --git a/output/o4-mini-2025-04-16/xlam_multiple_tool_multiple_call.parquet b/output/o4-mini-2025-04-16/xlam_multiple_tool_multiple_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..cfe29136fd2146bb366744968f0e614c59783b56 --- /dev/null +++ b/output/o4-mini-2025-04-16/xlam_multiple_tool_multiple_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69e9aa44caf84f1fae3c586f34c9036b00ac213b0e3a6d999ba1f2bfd29bc8ab +size 89609 diff --git a/output/o4-mini-2025-04-16/xlam_multiple_tool_single_call.parquet b/output/o4-mini-2025-04-16/xlam_multiple_tool_single_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e86d1d8a901e1edb0ad2c1864e38085a0fa0fd42 --- /dev/null +++ b/output/o4-mini-2025-04-16/xlam_multiple_tool_single_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d0f9f2ea967795e1617a70a68ed9394170f1998d45936de721906f2c9171399 +size 38580 diff --git a/output/o4-mini-2025-04-16/xlam_single_tool_multiple_call.parquet b/output/o4-mini-2025-04-16/xlam_single_tool_multiple_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..30f32f534ec260bb7d24d4ff2044e93ee12eca41 --- /dev/null +++ b/output/o4-mini-2025-04-16/xlam_single_tool_multiple_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:052fc4c75cba5ff1ba16a275bd9ecef6cd460a3a56bf4cfe35b4f2d88cc37e07 +size 31463 diff --git a/output/o4-mini-2025-04-16/xlam_single_tool_single_call.parquet b/output/o4-mini-2025-04-16/xlam_single_tool_single_call.parquet new file mode 100644 index 0000000000000000000000000000000000000000..dd4384993b7f3d1834379970d6b0b0f6d9756a8d --- /dev/null +++ b/output/o4-mini-2025-04-16/xlam_single_tool_single_call.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de673b082f6038ddb3b014691938ed2470bfd481382c0ef481d8ed47226f5746 +size 45538 diff --git a/output/o4-mini-2025-04-16/xlam_tool_miss.parquet b/output/o4-mini-2025-04-16/xlam_tool_miss.parquet new file mode 100644 index 0000000000000000000000000000000000000000..add91d36973ad9e41502a995afb7d6936369f0ca --- /dev/null +++ b/output/o4-mini-2025-04-16/xlam_tool_miss.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b040f497139e9a4012756549cd5f2273af75f6d929d4a83b4e920bd078ec0e9e +size 48720 diff --git a/results.csv b/results.csv index 96be807cdff505c4a8d089c4284464f190d522df..84928482fe0d8eef5f43f012b3ff72d9fcfbe2a0 100644 --- a/results.csv +++ b/results.csv @@ -3,21 +3,26 @@ claude-3-7-sonnet-20250219,Private,Reasoning,Anthropic,3,15,0.953,0.96,0.95,0.92 gemini-2.5-pro-preview-03-25,Private,Reasoning,Google,1.25,10,0.941,0.93,0.95,0.95,0.97,0.97,0.82,0.95,0.99,0.89,0.92,1,0.93,1,0.84,0.95,1 gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965 gemini-2.0-flash-lite-001,Private,Normal,Google,0.075,0.3,0.933,0.96,0.91,0.81,0.98,0.98,0.9,0.91,0.92,0.98,0.86,0.99,0.87,0.97,0.96,0.95,0.975 +gpt-4.1-2025-04-14,Private,Normal,OpenAI,2,8,0.918,0.94,0.89,0.85,0.88,0.94,0.97,0.88,0.99,0.75,0.87,0.97,0.95,0.99,0.98,0.86,0.965 mistral-small-2503,Open source,Normal,Mistral,0.1,0.3,0.912,0.93,0.89,0.85,0.93,0.86,0.91,0.9,1,0.83,0.81,0.99,0.87,0.99,0.95,0.9,0.975 +gpt-4.1-mini-2025-04-14,Private,Normal,OpenAI,0.4,1.6,0.910,0.93,0.89,0.9,0.85,0.93,0.98,0.92,0.99,0.71,0.93,0.94,0.94,1,0.98,0.75,0.925 deepseek-v3-0324,Open source,Normal,Deepseek,0.27,1.1,0.905,0.91,0.90,0.93,0.9,0.77,0.98,0.87,1,0.7,0.92,0.96,0.91,0.96,0.98,0.84,0.95 gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965 gpt-4.5-preview-2025-02-27,Private,Normal,OpenAI,75,150,0.900,0.93,0.87,0.85,0.91,0.92,0.97,0.92,0.99,0.67,0.85,0.98,0.85,1,0.98,0.8,0.915 gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99 +o4-mini-2025-04-16,Private,Reasoning,OpenAI,1.1,4.4,0.889,0.88,0.90,0.87,0.93,1,0.74,0.88,1,0.78,0.85,0.97,0.9,1,0.67,0.89,0.965 palmyra-x-004,Private,Normal,Writer,5,12,0.886,0.92,0.85,0.91,0.78,0.89,0.94,0.84,0.97,0.69,0.86,1,0.76,1,0.98,0.84,0.95 gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925 o1-2024-12-17,Private,Reasoning,OpenAI,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95 amazon.nova-pro-v1,Private,Normal,Amazon,0.8,3.2,0.868,0.94,0.79,0.77,0.81,0.94,0.97,0.73,0.93,0.93,0.78,0.92,0.81,0.94,0.97,0.75,0.9 amazon.nova-lite-v1,Private,Normal,Amazon,0.06,0.24,0.868,0.91,0.83,0.83,0.87,0.83,0.9,0.9,0.93,0.91,0.75,0.94,0.74,0.88,0.96,0.78,0.925 +o3-2025-04-16,Private,Reasoning,OpenAI,10,40,0.861,0.86,0.86,0.75,0.88,0.95,0.74,0.86,0.99,0.77,0.79,0.96,0.88,0.99,0.64,0.9,0.95 o3-mini-2025-01-31,Private,Reasoning,OpenAI,1.1,4.4,0.847,0.80,0.90,0.87,0.91,0.84,0.72,0.93,0.98,0.63,0.85,0.97,0.84,1,0.43,0.91,0.975 gpt-4o-mini,Private,Normal,OpenAI,0.15,0.6,0.832,0.85,0.82,0.82,0.85,0.51,0.98,0.83,1,0.54,0.83,0.94,0.83,0.96,0.99,0.73,0.835 amazon.nova-micro-v1,Private,Normal,Amazon,0.035,0.14,0.829,0.90,0.75,0.77,0.79,0.8,0.97,0.69,0.87,0.89,0.74,0.93,0.68,0.91,0.96,0.7,0.91 qwen2.5-72b-instruct,Open source,Normal,Alibaba,0.9,0.9,0.817,0.80,0.84,0.84,0.87,0.92,0.63,0.86,0.99,0.66,0.79,0.99,0.77,0.97,0.42,0.78,0.95 mistral-large-2411,Private,Normal,Mistral,2,6,0.810,0.87,0.75,0.77,0.76,0.83,0.93,0.75,0.97,0.65,0.77,0.87,0.78,0.9,0.94,0.7,0.725 +gpt-4.1-nano-2025-04-14,Private,Normal,OpenAI,0.1,0.4,0.803,0.85,0.76,0.81,0.75,0.83,0.86,0.73,0.93,0.49,0.8,0.92,0.72,0.94,0.95,0.71,0.8 claude-3-5-sonnet-20241022,Private,Normal,Anthropic,3,15,0.801,0.83,0.77,0.68,0.81,0.68,0.78,0.85,0.91,0.92,0.67,0.9,0.75,0.74,0.88,0.69,0.955 Llama-3.3-70B-Instruct-Turbo,Open source,Normal,Meta,0.9,0.9,0.774,0.86,0.69,0.85,0.5,0.72,0.87,0.57,0.99,0.61,0.79,0.9,0.73,0.93,0.97,0.54,0.865 claude-3-5-haiku-20241022,Private,Normal,Anthropic,0.8,4,0.765,0.78,0.75,0.72,0.72,0.72,0.79,0.79,0.85,0.76,0.73,0.84,0.69,0.65,0.88,0.66,0.905 @@ -27,4 +32,4 @@ ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73, Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575 open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41 Llama-4-Scout-17B-16E-Instruct,Open source,Normal,Meta,0.18,0.59,0.629,0.69,0.57,0.73,0.51,0.74,0.94,0.51,0.93,0.25,0.71,0.2,0.72,0.81,0.94,0.49,0.33 -Dataset Avg,,,,,,,0.86,0.82,0.82,0.82,0.81,0.90,0.82,0.96,0.68,0.82,0.86,0.82,0.93,0.88,0.77,0.85 \ No newline at end of file +Dataset Avg,,,,,,,0.87,0.82,0.83,0.82,0.83,0.89,0.83,0.96,0.69,0.82,0.88,0.83,0.94,0.88,0.78,0.86 \ No newline at end of file