diff --git a/create_leaderboard.py b/create_leaderboard.py index 07ef86ca9cd7f5861566b7d128c087c2284368f1..839fa37b1853546e5017b5768777cbf13cb120ab 100644 --- a/create_leaderboard.py +++ b/create_leaderboard.py @@ -66,7 +66,10 @@ with gr.Blocks(title="DeepResearch Bench") as demo: Website | Paper | Eval Dataset | - Total models: 16 | Last Update: 28 May 2025 + Total models: 19 | Last Update: 29 Dec 2024
+ + Race judge model: gemini-2.5-pro | Fact-checking models: gemini-2.5-flash + """) diff --git a/data/data_viewer.jsonl b/data/data_viewer.jsonl index 5b61da5f814aa722858757c27b9b9ac5c2d9e2f6..8ba4d39ff095f435b49d964801d863e0d96a40b9 100644 --- a/data/data_viewer.jsonl +++ b/data/data_viewer.jsonl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7ab11f250f4ffd6bf9c74ff8dc1e68f86d7abbf4f6319164bb476177ad7bf6e -size 28044256 +oid sha256:647067a9eec626525fa41f257123b5b35f9daf6e9862467e9dc259f987ce621f +size 40834049 diff --git a/data/fact_results/claude-3-5-sonnet-with-search/fact_result.txt b/data/fact_results/claude-3-5-sonnet-with-search/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ff98b53590e941a8acd6b98dd3cbc66244f62f9 --- /dev/null +++ b/data/fact_results/claude-3-5-sonnet-with-search/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 9.94 +total_valid_citations: 9.35 +valid_rate: 0.9406438631790744 diff --git a/data/fact_results/claude-3-7-sonnet-with-search/fact_result.txt b/data/fact_results/claude-3-7-sonnet-with-search/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a37dedf9a28c0433bf2d1f131dcd6309f0e3b95 --- /dev/null +++ b/data/fact_results/claude-3-7-sonnet-with-search/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 28.07 +total_valid_citations: 24.51 +valid_rate: 0.8731742073387959 diff --git a/data/fact_results/doubao-deepresearch/fact_result.txt b/data/fact_results/doubao-deepresearch/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a47562576f9f95fd0d3d0b39df1711c8f0e704d --- /dev/null +++ b/data/fact_results/doubao-deepresearch/fact_result.txt @@ -0,0 +1,3 @@ +citations_per_task: 99.5510 +supported_per_task: 52.6224 +valid_rate: 0.5286 diff --git a/data/fact_results/gemini-2.5-flash-preview-04-17/fact_result.txt b/data/fact_results/gemini-2.5-flash-preview-04-17/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..befd1e7f66c36fef1bdd22c6d66a18689da0e707 --- /dev/null +++ b/data/fact_results/gemini-2.5-flash-preview-04-17/fact_result.txt @@ -0,0 +1 @@ +No tasks with valid results. diff --git a/data/fact_results/gemini-2.5-pro-deepresearch/fact_result.txt b/data/fact_results/gemini-2.5-pro-deepresearch/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..da40e8b7211a8f3869c7dcc7473e55f772bab7c8 --- /dev/null +++ b/data/fact_results/gemini-2.5-pro-deepresearch/fact_result.txt @@ -0,0 +1,3 @@ +citations_per_task: 211.1616 +supported_per_task: 165.3434 +valid_rate: 0.7830 diff --git a/data/fact_results/gemini-2.5-pro-preview-05-06/fact_result.txt b/data/fact_results/gemini-2.5-pro-preview-05-06/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..befd1e7f66c36fef1bdd22c6d66a18689da0e707 --- /dev/null +++ b/data/fact_results/gemini-2.5-pro-preview-05-06/fact_result.txt @@ -0,0 +1 @@ +No tasks with valid results. diff --git a/data/fact_results/gpt-4.1-mini/fact_result.txt b/data/fact_results/gpt-4.1-mini/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..665a51e8d338db73242be5ff3102c882d8cd7390 --- /dev/null +++ b/data/fact_results/gpt-4.1-mini/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 4.85 +total_valid_citations: 4.1 +valid_rate: 0.845360824742268 diff --git a/data/fact_results/gpt-4.1/fact_result.txt b/data/fact_results/gpt-4.1/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..b74a4e7f6d445d1dc8a80a1a3a192062accce06d --- /dev/null +++ b/data/fact_results/gpt-4.1/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 4.746987951807229 +total_valid_citations: 4.265060240963855 +valid_rate: 0.8984771573604061 diff --git a/data/fact_results/gpt-4o-mini-search-preview/fact_result.txt b/data/fact_results/gpt-4o-mini-search-preview/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..e221efe2487d53e3582ca9738dc74958dad2812d --- /dev/null +++ b/data/fact_results/gpt-4o-mini-search-preview/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 5.651162790697675 +total_valid_citations: 4.616279069767442 +valid_rate: 0.8168724279835391 diff --git a/data/fact_results/gpt-4o-search-preview/fact_result.txt b/data/fact_results/gpt-4o-search-preview/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ffbdcc8cd3713d8bcad82828caa4d9d3d120868 --- /dev/null +++ b/data/fact_results/gpt-4o-search-preview/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 5.825581395348837 +total_valid_citations: 5.046511627906977 +valid_rate: 0.8662674650698603 diff --git a/data/fact_results/grok-deeper-search/fact_result.txt b/data/fact_results/grok-deeper-search/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a9792615a44408ce67ff57f32674a8a625b7b29 --- /dev/null +++ b/data/fact_results/grok-deeper-search/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 11.741935483870968 +total_valid_citations: 8.580645161290322 +valid_rate: 0.7307692307692307 diff --git a/data/fact_results/openai-deepresearch/fact_result.txt b/data/fact_results/openai-deepresearch/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..2dd776b43995475dc5dbad6f7053521854f9117b --- /dev/null +++ b/data/fact_results/openai-deepresearch/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 53.04040404040404 +total_valid_citations: 39.78787878787879 +valid_rate: 0.7501428299371549 diff --git a/data/fact_results/perplexity-Research/fact_result.txt b/data/fact_results/perplexity-Research/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0af84e34bc98cc6d7dbab0d78eae80ad78b6ac3 --- /dev/null +++ b/data/fact_results/perplexity-Research/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 37.76 +total_valid_citations: 31.2 +valid_rate: 0.826271186440678 diff --git a/data/fact_results/sonar-pro/fact_result.txt b/data/fact_results/sonar-pro/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..74f4102aa61797806770d0e4c3ba576bf8335974 --- /dev/null +++ b/data/fact_results/sonar-pro/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 21.01 +total_valid_citations: 16.75 +valid_rate: 0.7972394098048549 diff --git a/data/fact_results/sonar-reasoning-pro/fact_result.txt b/data/fact_results/sonar-reasoning-pro/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..02d6e3e611db6c1a65ec3a1fad6c4bcc81ccebcd --- /dev/null +++ b/data/fact_results/sonar-reasoning-pro/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 20.78 +total_valid_citations: 9.39 +valid_rate: 0.45187680461982677 diff --git a/data/fact_results/sonar-reasoning/fact_result.txt b/data/fact_results/sonar-reasoning/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..535e9bc2665b13492f624d5d3622afbbfdfb79c6 --- /dev/null +++ b/data/fact_results/sonar-reasoning/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 25.43 +total_valid_citations: 13.37 +valid_rate: 0.525756979944947 diff --git a/data/fact_results/sonar/fact_result.txt b/data/fact_results/sonar/fact_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..15ed17aeb5fe191bec3ca90ab57f10cc3804f561 --- /dev/null +++ b/data/fact_results/sonar/fact_result.txt @@ -0,0 +1,3 @@ +total_citations: 13.97872340425532 +total_valid_citations: 10.680851063829786 +valid_rate: 0.7640791476407914 diff --git a/data/leaderboard.csv b/data/leaderboard.csv index facdb1b2f115eb9ea4627520110806a07ae91d9b..bd6ee4f900bf1f337adef416f8087da4b482d770 100644 --- a/data/leaderboard.csv +++ b/data/leaderboard.csv @@ -1,17 +1,20 @@ model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations -gemini-2.5-pro-deepresearch,48.88,48.53,48.50,49.18,49.44,81.44,111.21 -openai-deepresearch,46.98,46.87,45.25,49.27,47.14,77.96,40.79 -perplexity-Research,42.25,40.69,39.39,46.40,44.28,90.24,31.26 -claude-3-7-sonnet-with-search,40.67,38.99,37.66,45.77,41.46,93.68,32.48 -grok-deeper-search,40.24,37.97,35.37,46.30,44.05,83.59,8.15 -perplexity-sonar-reasoning-pro,40.22,37.38,36.11,45.66,44.74,39.36,8.35 -perplexity-sonar-reasoning,40.18,37.14,36.73,45.15,44.35,48.67,11.34 -perplexity-sonar-pro,38.93,36.38,34.26,44.70,43.35,78.66,14.74 -gemini-2.5-pro-with-grounding,35.12,34.06,29.79,41.67,37.16,81.81,32.88 -gpt-4o-search-preview,35.10,31.99,27.57,43.17,41.23,88.41,4.79 -perplexity-sonar,34.54,30.95,27.51,42.33,41.60,74.42,8.67 -gpt-4.1-with-search,33.46,29.42,25.38,42.33,40.77,87.83,4.42 -gemini-2.5-flash-preview-04-17,32.39,31.63,26.73,38.82,34.48,81.92,31.08 -gpt-4o-mini-search-preview,31.55,27.38,22.64,40.67,39.91,84.98,4.95 -gpt-4.1-mini-with-search,30.26,26.05,20.75,39.65,39.33,84.58,4.35 -claude-3-5-sonnet-with-search,28.48,24.82,22.82,35.12,35.08,94.04,9.78 \ No newline at end of file +gemini-2.5-pro-deepresearch,48.92,48.45,48.30,49.29,49.77,78.30,165.34 +openai-deepresearch,46.45,46.46,43.73,49.39,47.22,75.01,39.79 +claude-research,45.00,45.34,42.79,47.58,44.66,-,- +doubao-deepresearch,44.34,44.84,40.56,47.95,44.69,52.86,52.62 +kimi-researcher,42.69,42.82,39.40,45.30,44.68,-,- +perplexity-Research,40.46,39.10,35.65,46.11,43.08,82.63,31.20 +grok-deeper-search,38.22,36.08,30.89,46.59,42.17,73.08,8.58 +sonar-reasoning-pro,37.76,34.96,31.65,44.93,42.42,45.19,9.39 +sonar-reasoning,37.75,34.73,32.59,44.42,42.39,52.58,13.37 +claude-3-7-sonnet-with-search,36.63,35.95,31.29,44.05,36.07,87.32,24.51 +sonar-pro,36.19,33.92,29.69,43.39,41.07,79.72,16.75 +gemini-2.5-pro-preview-05-06,31.90,31.75,24.61,40.24,32.76,-,- +gpt-4o-search-preview,30.74,27.81,20.44,41.01,37.60,86.63,5.05 +sonar,30.64,27.14,21.62,40.70,37.46,76.41,10.68 +gpt-4.1,29.31,25.59,18.42,40.63,36.49,89.85,4.27 +gemini-2.5-flash-preview-04-17,29.19,28.97,21.62,37.80,29.97,-,- +gpt-4o-mini-search-preview,27.62,24.24,16.62,38.59,35.27,81.69,4.62 +gpt-4.1-mini,26.62,22.86,15.39,38.18,34.49,84.54,4.10 +claude-3-5-sonnet-with-search,23.95,21.28,16.20,32.41,29.87,94.06,9.35 diff --git a/data/raw_data/claude-research.jsonl b/data/raw_data/claude-research.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5fcaceb4db2b3c38d494b42d5f47461651a7bc11 --- /dev/null +++ b/data/raw_data/claude-research.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:300f8dbc8242a5852bbe44098403f35fac1e4136e2274c93f0a3d659fee00d7f +size 1513379 diff --git a/data/raw_data/doubao-deepresearch.jsonl b/data/raw_data/doubao-deepresearch.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a2e78037c006e549a862aa1e12a30b894497722 --- /dev/null +++ b/data/raw_data/doubao-deepresearch.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b9512cedf730486da486ba9d0ec305213ca8176bd00b93da788367f090717f2 +size 7451876 diff --git a/data/raw_data/gemini-2.5-flash-with-grounding.jsonl b/data/raw_data/gemini-2.5-flash-preview-04-17.jsonl similarity index 100% rename from data/raw_data/gemini-2.5-flash-with-grounding.jsonl rename to data/raw_data/gemini-2.5-flash-preview-04-17.jsonl diff --git a/data/raw_data/gemini-2.5-pro-deepresearch.jsonl b/data/raw_data/gemini-2.5-pro-deepresearch.jsonl index 06ebc42cb73fab6016d60139b4c774971103e52a..8fe301da9499c3feaacea4dd2732d3bbe5a7a1e0 100644 --- a/data/raw_data/gemini-2.5-pro-deepresearch.jsonl +++ b/data/raw_data/gemini-2.5-pro-deepresearch.jsonl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33c5d28e76595f22fae1b0fbbe2700958bfe707dafe53f7c5842d3067ccfddef -size 8523353 +oid sha256:3705106cc42a38b7c7bcd90e42aaec7f688a0d52179329e3074fcda99ea544e7 +size 8523153 diff --git a/data/raw_data/gemini-2.5-pro-with-grounding.jsonl b/data/raw_data/gemini-2.5-pro-preview-05-06.jsonl similarity index 100% rename from data/raw_data/gemini-2.5-pro-with-grounding.jsonl rename to data/raw_data/gemini-2.5-pro-preview-05-06.jsonl diff --git a/data/raw_data/gpt-4.1-mini-with-search.jsonl b/data/raw_data/gpt-4.1-mini.jsonl similarity index 100% rename from data/raw_data/gpt-4.1-mini-with-search.jsonl rename to data/raw_data/gpt-4.1-mini.jsonl diff --git a/data/raw_data/gpt-4.1-with-search.jsonl b/data/raw_data/gpt-4.1-with-search.jsonl deleted file mode 100644 index 03ca8b1b5a43fc274c4749e82c2e175ec226cc84..0000000000000000000000000000000000000000 --- a/data/raw_data/gpt-4.1-with-search.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e0228624b09e9d6c25c72156f4dd7f5702e3adcdd71a1f309094c2913eb50639 -size 492406 diff --git a/data/raw_data/gpt-4.1.jsonl b/data/raw_data/gpt-4.1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19efed14017cd0d84f49e92b3814b3b6fe34ff72 --- /dev/null +++ b/data/raw_data/gpt-4.1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7611ff4a5cd8824aa711c9759d7dff02a990a57fbb36d699442484431bd5662 +size 492409 diff --git a/data/raw_data/grok-deeper-search.jsonl b/data/raw_data/grok-deeper-search.jsonl index 3af0f1a3a4366ea4f7f66c3972de52547d81795d..5a6b524d0b48b82851be7e3d9c965a8ed8ccafd9 100644 --- a/data/raw_data/grok-deeper-search.jsonl +++ b/data/raw_data/grok-deeper-search.jsonl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea6428dcf2e729d84f019c302fb3862a85cefbea08282b5ffcc5c400306ab077 -size 1149933 +oid sha256:f14c64de8c22d66b5a1c08af0cb0d829d9a4b671378a2952230b2219d258f0ba +size 1149833 diff --git a/data/raw_data/kimi-researcher.jsonl b/data/raw_data/kimi-researcher.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d04c644a1d1e13665c60cd395a7bde11474f4ba1 --- /dev/null +++ b/data/raw_data/kimi-researcher.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:868ac817d88b63a7a253ecf4439b85205ad1c49f2879f4b46f1a9a34d6cf804f +size 3773315 diff --git a/data/raw_data/openai-deepresearch.jsonl b/data/raw_data/openai-deepresearch.jsonl index 89f6322b4e7f791282100772ae2f47300509288a..8cb76d73b594224de123058b67d8a00d8c96970f 100644 --- a/data/raw_data/openai-deepresearch.jsonl +++ b/data/raw_data/openai-deepresearch.jsonl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77d31b8ea1abd9aa8e924303451dc6a0f334f2e9d4d61ec71847c4db004ac62a -size 6903938 +oid sha256:8a9dbbf7f18d8c985bc4d4f450089eb4bb73e77dbf7168a1bb4c81f811e06d84 +size 6903838 diff --git a/data/raw_data/perplexity-Research.jsonl b/data/raw_data/perplexity-Research.jsonl index 14b989cca9757ec346ad21bdf63a15077a02b5af..e00b167dfb2b581a40fb84982234195fb9933225 100644 --- a/data/raw_data/perplexity-Research.jsonl +++ b/data/raw_data/perplexity-Research.jsonl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f27cb31cbab84f60efc3286592e84690fd117355dd84f9e4a9299108245c2a5 -size 1747979 +oid sha256:0a3b855862c99f108abf97b9e402b43eb4d3376c3ec93c2e0a9c871b70d0736e +size 1747879 diff --git a/data/raw_data/perplexity-sonar-pro.jsonl b/data/raw_data/perplexity-sonar-pro.jsonl deleted file mode 100644 index 56c0d03f64c5cd25b4907cd7254d2930afefb2d0..0000000000000000000000000000000000000000 --- a/data/raw_data/perplexity-sonar-pro.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d577c0a208b35eb2c0454c00c70b12759cd8a1687f730f2133d8f392c1831ee -size 750234 diff --git a/data/raw_data/perplexity-sonar-reasoning-pro.jsonl b/data/raw_data/perplexity-sonar-reasoning-pro.jsonl deleted file mode 100644 index b649b3f834897f26017bb8c2515f629b2824473c..0000000000000000000000000000000000000000 --- a/data/raw_data/perplexity-sonar-reasoning-pro.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e08c6c4094bbf0aa1749e7b1a45e856a6635b2df6afdf0de8eeafea99e7477fc -size 495156 diff --git a/data/raw_data/perplexity-sonar.jsonl b/data/raw_data/perplexity-sonar.jsonl deleted file mode 100644 index 0052be2f8957ae0cef0794674113e9035e5c428c..0000000000000000000000000000000000000000 --- a/data/raw_data/perplexity-sonar.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dc0ef26282e404b700d56e158644f44228c49a3d5126fa12c8068e053444131e -size 574856 diff --git a/data/raw_data/sonar-pro.jsonl b/data/raw_data/sonar-pro.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1adaad480adc82c3156ded24a94583f41d1f7470 --- /dev/null +++ b/data/raw_data/sonar-pro.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39dbb2c1eae9fe1bc32abaedebe75bbc643ba18ec25e3360726ffb9d514c52ec +size 750234 diff --git a/data/raw_data/sonar-reasoning-pro.jsonl b/data/raw_data/sonar-reasoning-pro.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7fa5906943b0173e9ddc0f56fe3d88e8ed2fe60b --- /dev/null +++ b/data/raw_data/sonar-reasoning-pro.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffa11b21375455127385b57c69442a2772aa96f0d5b410e274d994eacf920c00 +size 495156 diff --git a/data/raw_data/perplexity-sonar-reasoning.jsonl b/data/raw_data/sonar-reasoning.jsonl similarity index 100% rename from data/raw_data/perplexity-sonar-reasoning.jsonl rename to data/raw_data/sonar-reasoning.jsonl diff --git a/data/raw_data/sonar.jsonl b/data/raw_data/sonar.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e3544ee01fd82117e61e2f79610ea94b401916a --- /dev/null +++ b/data/raw_data/sonar.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96626a86dca8c67848e8ce9e71e76c4cddec7066ba209e2018d13781ad23f17f +size 574856 diff --git a/data/raw_results/claude-3-5-sonnet-with-search.jsonl b/data/raw_results/claude-3-5-sonnet-with-search.jsonl deleted file mode 100644 index 6cbf4daaed0703e6af45f79f84ec8b7de9b73551..0000000000000000000000000000000000000000 --- a/data/raw_results/claude-3-5-sonnet-with-search.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c0c47d1bab126886420bd53bb41a8905cdfb97f105711bcc2f5a27e3d53652ea -size 1992421 diff --git a/data/raw_results/claude-3-5-sonnet-with-search/race_result.txt b/data/raw_results/claude-3-5-sonnet-with-search/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..4950288405425e05e3e491e7715c091f2e813b47 --- /dev/null +++ b/data/raw_results/claude-3-5-sonnet-with-search/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.2128 +Insight: 0.1620 +Instruction Following: 0.3241 +Readability: 0.2987 +Overall Score: 0.2395 diff --git a/data/raw_results/claude-3-5-sonnet-with-search/raw_results.jsonl b/data/raw_results/claude-3-5-sonnet-with-search/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..48e8689fcb74e25c0b58a8b6b66455c5abfa4898 --- /dev/null +++ b/data/raw_results/claude-3-5-sonnet-with-search/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0423e9b7888c1070e43272284414d91deb490e38d48d0f7d6afc004db6fea291 +size 52686 diff --git a/data/raw_results/claude-3-7-sonnet-with-search.jsonl b/data/raw_results/claude-3-7-sonnet-with-search.jsonl deleted file mode 100644 index 4d4b9e9e111c47cc90050482a374c1c1ddfb3893..0000000000000000000000000000000000000000 --- a/data/raw_results/claude-3-7-sonnet-with-search.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2b3a6bf74400c89d24fa47853ab034ed3696ee0694c2d190ba83c3f5dcd8a0ef -size 2002379 diff --git a/data/raw_results/claude-3-7-sonnet-with-search/race_result.txt b/data/raw_results/claude-3-7-sonnet-with-search/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..eae703902d64d312abd5e4078c10b9ec0aff474e --- /dev/null +++ b/data/raw_results/claude-3-7-sonnet-with-search/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.3595 +Insight: 0.3129 +Instruction Following: 0.4405 +Readability: 0.3607 +Overall Score: 0.3663 diff --git a/data/raw_results/claude-3-7-sonnet-with-search/raw_results.jsonl b/data/raw_results/claude-3-7-sonnet-with-search/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4ef93f616f5d3e24b0916d49b1482211c320293e --- /dev/null +++ b/data/raw_results/claude-3-7-sonnet-with-search/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fde53e238b0f658ccc09d1efb7d6a13e276cd9d67e9e722adc5531dc1561853 +size 52517 diff --git a/data/raw_results/claude-research/race_result.txt b/data/raw_results/claude-research/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fb132c7a92322f3f021dec1453377cd9dfa7cb1 --- /dev/null +++ b/data/raw_results/claude-research/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.4534 +Insight: 0.4279 +Instruction Following: 0.4758 +Readability: 0.4466 +Overall Score: 0.4500 diff --git a/data/raw_results/claude-research/raw_results.jsonl b/data/raw_results/claude-research/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..24868ca43ca1a8b7334cb535810ff3809c643d43 --- /dev/null +++ b/data/raw_results/claude-research/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6616b89c3b38eb5f4822cdf82326648652c3fb19b5aca5ada99552cba4f529 +size 52454 diff --git a/data/raw_results/doubao-deepresearch/race_result.txt b/data/raw_results/doubao-deepresearch/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd6f591e6b8c979d9b071ca503a001110cad00a3 --- /dev/null +++ b/data/raw_results/doubao-deepresearch/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.4484 +Insight: 0.4056 +Instruction Following: 0.4795 +Readability: 0.4469 +Overall Score: 0.4434 diff --git a/data/raw_results/doubao-deepresearch/raw_results.jsonl b/data/raw_results/doubao-deepresearch/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c93261832f7be8b84d3e4b6339870a56865038c --- /dev/null +++ b/data/raw_results/doubao-deepresearch/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:732d86bd51bfb687821590ecc227cbce30663d309069e812d6e4bba103ca5e2a +size 51890 diff --git a/data/raw_results/gemini-2.5-flash-preview-04-17/race_result.txt b/data/raw_results/gemini-2.5-flash-preview-04-17/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e84bbaeced824bc43630875b1ae6f8b582b4a3e --- /dev/null +++ b/data/raw_results/gemini-2.5-flash-preview-04-17/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.2897 +Insight: 0.2162 +Instruction Following: 0.3780 +Readability: 0.2997 +Overall Score: 0.2919 diff --git a/data/raw_results/gemini-2.5-flash-preview-04-17/raw_results.jsonl b/data/raw_results/gemini-2.5-flash-preview-04-17/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b7d939a134e08cd34755c66e812adec21e4d684 --- /dev/null +++ b/data/raw_results/gemini-2.5-flash-preview-04-17/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ee20445df1bc4ca52e8c0de0d8f02d65e808df8f834b789fed749004e2bc3a4 +size 52637 diff --git a/data/raw_results/gemini-2.5-flash-with-grounding.jsonl b/data/raw_results/gemini-2.5-flash-with-grounding.jsonl deleted file mode 100644 index 06b805115fa66dfdac2aa885a6d5fe5d09129c37..0000000000000000000000000000000000000000 --- a/data/raw_results/gemini-2.5-flash-with-grounding.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:43b9f71819babb5c00f65f0dd71d707323fb803c585bd74976f49cdc34ab80aa -size 1951481 diff --git a/data/raw_results/gemini-2.5-pro-deepresearch.jsonl b/data/raw_results/gemini-2.5-pro-deepresearch.jsonl deleted file mode 100644 index d0cfc1e3352805fd32bbec4397b7945440da9337..0000000000000000000000000000000000000000 --- a/data/raw_results/gemini-2.5-pro-deepresearch.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ac2fc53c99697e3276c98d735ed630df6fa49d2972c70a5409adc1958ecaa7b7 -size 1937730 diff --git a/data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt b/data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..3462a5d40845565c4e6651cb43984d73ea161c3f --- /dev/null +++ b/data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.4845 +Insight: 0.4830 +Instruction Following: 0.4929 +Readability: 0.4977 +Overall Score: 0.4892 diff --git a/data/raw_results/gemini-2.5-pro-deepresearch/raw_results.jsonl b/data/raw_results/gemini-2.5-pro-deepresearch/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4eecca39d953880d6f9b4dc909a027df9d967d65 --- /dev/null +++ b/data/raw_results/gemini-2.5-pro-deepresearch/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34e1afac1851c1e81b65f1f3844aa8da886ef20558d7891ce7145f7c63cc53ca +size 51986 diff --git a/data/raw_results/gemini-2.5-pro-preview-05-06/race_result.txt b/data/raw_results/gemini-2.5-pro-preview-05-06/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..e34a15ab4d1fbf3ac85495ec1e1c29fbdcc4fe2d --- /dev/null +++ b/data/raw_results/gemini-2.5-pro-preview-05-06/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.3175 +Insight: 0.2461 +Instruction Following: 0.4024 +Readability: 0.3276 +Overall Score: 0.3190 diff --git a/data/raw_results/gemini-2.5-pro-preview-05-06/raw_results.jsonl b/data/raw_results/gemini-2.5-pro-preview-05-06/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d739db7b21b5b2d01d394c45df9e15ebcd9ea358 --- /dev/null +++ b/data/raw_results/gemini-2.5-pro-preview-05-06/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce921d7368ee00c8a2d573331fd42e5b813af29ecfc367048cdb3b1367ff3e43 +size 52566 diff --git a/data/raw_results/gemini-2.5-pro-with-grounding.jsonl b/data/raw_results/gemini-2.5-pro-with-grounding.jsonl deleted file mode 100644 index 64b458d9c012cecfd581bbef650f5a5fea526a8d..0000000000000000000000000000000000000000 --- a/data/raw_results/gemini-2.5-pro-with-grounding.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5e911a18cf8b8a8207eb45584ac650e4640f79db7352055ca5e92356de37f911 -size 1944815 diff --git a/data/raw_results/gpt-4.1-mini-with-search.jsonl b/data/raw_results/gpt-4.1-mini-with-search.jsonl deleted file mode 100644 index afb748e6922224b0767dd4944dc7ff0e242c118b..0000000000000000000000000000000000000000 --- a/data/raw_results/gpt-4.1-mini-with-search.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:948a403d12bcf6b0e3ce6664f83afeb95413684ab0b7912003ed756a4df15c5e -size 1992345 diff --git a/data/raw_results/gpt-4.1-mini/race_result.txt b/data/raw_results/gpt-4.1-mini/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3954c80e00f551808c7674a9b48137359576c68 --- /dev/null +++ b/data/raw_results/gpt-4.1-mini/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.2286 +Insight: 0.1539 +Instruction Following: 0.3818 +Readability: 0.3449 +Overall Score: 0.2662 diff --git a/data/raw_results/gpt-4.1-mini/raw_results.jsonl b/data/raw_results/gpt-4.1-mini/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14bc78e7ce47cacee0957c251d66cdad703eb2fa --- /dev/null +++ b/data/raw_results/gpt-4.1-mini/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b47556452f7b784c3c4f6d730e058937706fa5adeea4aeb93a30a710dfa6412 +size 52638 diff --git a/data/raw_results/gpt-4.1-with-search.jsonl b/data/raw_results/gpt-4.1-with-search.jsonl deleted file mode 100644 index a2b0e22d3e3e4d03bca5442dbbdf52fe402705d8..0000000000000000000000000000000000000000 --- a/data/raw_results/gpt-4.1-with-search.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:908a5989af337e381bf2bce6795438edd21966f313b5194f532feb1f47e5b812 -size 2090582 diff --git a/data/raw_results/gpt-4.1/race_result.txt b/data/raw_results/gpt-4.1/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc508b619460f8620a860b0a82c4cacb67867fae --- /dev/null +++ b/data/raw_results/gpt-4.1/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.2559 +Insight: 0.1842 +Instruction Following: 0.4063 +Readability: 0.3649 +Overall Score: 0.2931 diff --git a/data/raw_results/gpt-4.1/raw_results.jsonl b/data/raw_results/gpt-4.1/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..adad3e3f4c8f7671c0fb472fbf17f00bf6da6b33 --- /dev/null +++ b/data/raw_results/gpt-4.1/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1a7903c48fb04860b0bb1718ce7004d492dbab5068545d2a321b80c5f16f711 +size 52618 diff --git a/data/raw_results/gpt-4o-mini-search-preview.jsonl b/data/raw_results/gpt-4o-mini-search-preview.jsonl deleted file mode 100644 index 7279175fb94fb3cdb888c7285edeebf9d1967f07..0000000000000000000000000000000000000000 --- a/data/raw_results/gpt-4o-mini-search-preview.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4277a9a91fcdaaeff1afe948c1088095d5f01092404fcd1a62407b7a58b7906e -size 2074673 diff --git a/data/raw_results/gpt-4o-mini-search-preview/race_result.txt b/data/raw_results/gpt-4o-mini-search-preview/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..d43e0dca99168528d11ce47febd36d539fba116e --- /dev/null +++ b/data/raw_results/gpt-4o-mini-search-preview/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.2424 +Insight: 0.1662 +Instruction Following: 0.3859 +Readability: 0.3527 +Overall Score: 0.2762 diff --git a/data/raw_results/gpt-4o-mini-search-preview/raw_results.jsonl b/data/raw_results/gpt-4o-mini-search-preview/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..163c29b07451c25438c0ab9048bfd0cf5b4d63ef --- /dev/null +++ b/data/raw_results/gpt-4o-mini-search-preview/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34ec46e38b055a970b4086fb2db9cfa98330793add7ff2757c02acc55de68b32 +size 52637 diff --git a/data/raw_results/gpt-4o-search-preview.jsonl b/data/raw_results/gpt-4o-search-preview.jsonl deleted file mode 100644 index 58cae9bc4dfa1f4d9d8aff2f7f388c399c34f14b..0000000000000000000000000000000000000000 --- a/data/raw_results/gpt-4o-search-preview.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7adcd70d49d3b5dd6050201aa4fcd31f51288945f4a23de14432a301cbf295a7 -size 2063854 diff --git a/data/raw_results/gpt-4o-search-preview/race_result.txt b/data/raw_results/gpt-4o-search-preview/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..397872a5c7b26a3ed633af238378e409518e2978 --- /dev/null +++ b/data/raw_results/gpt-4o-search-preview/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.2781 +Insight: 0.2044 +Instruction Following: 0.4101 +Readability: 0.3760 +Overall Score: 0.3074 diff --git a/data/raw_results/gpt-4o-search-preview/raw_results.jsonl b/data/raw_results/gpt-4o-search-preview/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37fc5c1304f6fed1283b08e9a34196dca57ee537 --- /dev/null +++ b/data/raw_results/gpt-4o-search-preview/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6ad7042da675dccdf90749986ce95fdc0f9bc2df2b3cd4db50bff4e70ff3ab5 +size 52617 diff --git a/data/raw_results/grok-deeper-search.jsonl b/data/raw_results/grok-deeper-search.jsonl deleted file mode 100644 index 00b5fe934660df3f6a024886f4e51e5dfac0ed94..0000000000000000000000000000000000000000 --- a/data/raw_results/grok-deeper-search.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b19fb7ec93872317eae94abeb02ed9c19912057acfa82600167ca853b750f476 -size 1968989 diff --git a/data/raw_results/grok-deeper-search/race_result.txt b/data/raw_results/grok-deeper-search/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..db282b350dcbccac12e458645ce174165eda99a1 --- /dev/null +++ b/data/raw_results/grok-deeper-search/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.3608 +Insight: 0.3089 +Instruction Following: 0.4659 +Readability: 0.4217 +Overall Score: 0.3822 diff --git a/data/raw_results/grok-deeper-search/raw_results.jsonl b/data/raw_results/grok-deeper-search/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2071550228b1407a7879d5c60d9e5f1f58b492a6 --- /dev/null +++ b/data/raw_results/grok-deeper-search/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56d1a924942aadf2cbab3fb718d654cdb25de59383991cbf340c58eef0671abd +size 52491 diff --git a/data/raw_results/kimi-researcher/race_result.txt b/data/raw_results/kimi-researcher/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..67b954e0ada304d7d2de4e25ac6233d6250f8373 --- /dev/null +++ b/data/raw_results/kimi-researcher/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.4282 +Insight: 0.3940 +Instruction Following: 0.4530 +Readability: 0.4468 +Overall Score: 0.4269 diff --git a/data/raw_results/kimi-researcher/raw_results.jsonl b/data/raw_results/kimi-researcher/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9d7841429343616a2cde51a98b7ab6470496ec2b --- /dev/null +++ b/data/raw_results/kimi-researcher/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2117bd7eb0cc91e705d64cb3013b2bf89cffce190f57b0fde5638a2efd6f027d +size 52510 diff --git a/data/raw_results/openai-deepresearch.jsonl b/data/raw_results/openai-deepresearch.jsonl deleted file mode 100644 index 603263d5b3d798707cc8950b76c5877c18943e31..0000000000000000000000000000000000000000 --- a/data/raw_results/openai-deepresearch.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae45c25f5b5c56a772331543e4eefe7c80e63f33b441dfe83cb4a5c830c88a35 -size 2007501 diff --git a/data/raw_results/openai-deepresearch/race_result.txt b/data/raw_results/openai-deepresearch/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..94181c515d754b55aaf8be6f01ec2cecfdc9b64a --- /dev/null +++ b/data/raw_results/openai-deepresearch/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.4646 +Insight: 0.4373 +Instruction Following: 0.4939 +Readability: 0.4722 +Overall Score: 0.4645 diff --git a/data/raw_results/openai-deepresearch/raw_results.jsonl b/data/raw_results/openai-deepresearch/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e255f35c53b9be22e7c755e0a5d260d3f1c082c9 --- /dev/null +++ b/data/raw_results/openai-deepresearch/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ba1919ed5dc8b57e65aaabfec2bfaf8cbacd9b5ac69ca753de6110c21a100e7 +size 52313 diff --git a/data/raw_results/perplexity-Research.jsonl b/data/raw_results/perplexity-Research.jsonl deleted file mode 100644 index 7aaab6c2c3979022f50cf4b639be8de8a36ae40e..0000000000000000000000000000000000000000 --- a/data/raw_results/perplexity-Research.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b7715271d17cc344873653464ae3fef884e0f3c6bec89deee347ed7a0651beb9 -size 2030483 diff --git a/data/raw_results/perplexity-Research/race_result.txt b/data/raw_results/perplexity-Research/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..29c6e53812660ef84379b87f7a11c8e29fca47a5 --- /dev/null +++ b/data/raw_results/perplexity-Research/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.3910 +Insight: 0.3565 +Instruction Following: 0.4611 +Readability: 0.4308 +Overall Score: 0.4046 diff --git a/data/raw_results/perplexity-Research/raw_results.jsonl b/data/raw_results/perplexity-Research/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a7838b67822ce1a5bbb0ac9765cb7c60bb24549 --- /dev/null +++ b/data/raw_results/perplexity-Research/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1141aa12a92d2026982cd899799ee773ece898211097f5e5cb3bdadf91a9a199 +size 52489 diff --git a/data/raw_results/perplexity-sonar-pro.jsonl b/data/raw_results/perplexity-sonar-pro.jsonl deleted file mode 100644 index 26188e17b7dd8c314c23e1022dc93c2a4ac581fe..0000000000000000000000000000000000000000 --- a/data/raw_results/perplexity-sonar-pro.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a453f5b29492f684f53364121e7c79eeb81aee2737a383e2748830a4e4453afb -size 1975770 diff --git a/data/raw_results/perplexity-sonar-reasoning-pro.jsonl b/data/raw_results/perplexity-sonar-reasoning-pro.jsonl deleted file mode 100644 index de695f14d14d614715ff9b2acb6ad6fd4801f506..0000000000000000000000000000000000000000 --- a/data/raw_results/perplexity-sonar-reasoning-pro.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:669a4a14232c63c716de766af7be050f8712f74a6d5437cc8fa637ded39f3c40 -size 1957092 diff --git a/data/raw_results/perplexity-sonar-reasoning.jsonl b/data/raw_results/perplexity-sonar-reasoning.jsonl deleted file mode 100644 index ca14b740222c630835f04801a5f485cf3fa22fb2..0000000000000000000000000000000000000000 --- a/data/raw_results/perplexity-sonar-reasoning.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bceb5637a9d0092af5ddcca49557a4f8f3604be9ebb430be32e820fa4d6723b3 -size 1951258 diff --git a/data/raw_results/perplexity-sonar.jsonl b/data/raw_results/perplexity-sonar.jsonl deleted file mode 100644 index 240df6ffd5c6e230e1e22264cd4f476fe8634b8a..0000000000000000000000000000000000000000 --- a/data/raw_results/perplexity-sonar.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36ecd1540447863f66bfe1a43905070f9c9b0d40de803348c3450a396df3d8fc -size 2016838 diff --git a/data/raw_results/sonar-pro/race_result.txt b/data/raw_results/sonar-pro/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac3847542f354f41d0d0644cf29a38e4569e4e88 --- /dev/null +++ b/data/raw_results/sonar-pro/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.3392 +Insight: 0.2969 +Instruction Following: 0.4339 +Readability: 0.4107 +Overall Score: 0.3619 diff --git a/data/raw_results/sonar-pro/raw_results.jsonl b/data/raw_results/sonar-pro/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd0804de5cacfba2df10d1090b90977aab7cdc4d --- /dev/null +++ b/data/raw_results/sonar-pro/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4badd5671a8bccf51ad1ad1f4617925abed30815153ecaa1b8eb3ceec4f46834 +size 52573 diff --git a/data/raw_results/sonar-reasoning-pro/race_result.txt b/data/raw_results/sonar-reasoning-pro/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..86ccf604bd5843241260b3ab34fbcff1055b16f9 --- /dev/null +++ b/data/raw_results/sonar-reasoning-pro/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.3496 +Insight: 0.3165 +Instruction Following: 0.4493 +Readability: 0.4242 +Overall Score: 0.3776 diff --git a/data/raw_results/sonar-reasoning-pro/raw_results.jsonl b/data/raw_results/sonar-reasoning-pro/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae08519af3511e23a7234999f0022b1b24c5b7e4 --- /dev/null +++ b/data/raw_results/sonar-reasoning-pro/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416a07a69a027d6dff5985260f8dffa2af7ea87456ca93e5fe33e003b68da15e +size 52469 diff --git a/data/raw_results/sonar-reasoning/race_result.txt b/data/raw_results/sonar-reasoning/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d52936c03f6f313f4c0f6c2ff201b71942a6c0c --- /dev/null +++ b/data/raw_results/sonar-reasoning/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.3473 +Insight: 0.3259 +Instruction Following: 0.4442 +Readability: 0.4239 +Overall Score: 0.3775 diff --git a/data/raw_results/sonar-reasoning/raw_results.jsonl b/data/raw_results/sonar-reasoning/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6a5127accc7c79ae3e1a57e56bdcda6cfe2e553 --- /dev/null +++ b/data/raw_results/sonar-reasoning/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:930e0af6b6ac7a97c4bd4dc55b9c95aab5e14a630c887fd69a69d58e367d667f +size 52498 diff --git a/data/raw_results/sonar/race_result.txt b/data/raw_results/sonar/race_result.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d7a2d099e0d187f205be03c261c67a30afc64c7 --- /dev/null +++ b/data/raw_results/sonar/race_result.txt @@ -0,0 +1,5 @@ +Comprehensiveness: 0.2714 +Insight: 0.2162 +Instruction Following: 0.4070 +Readability: 0.3746 +Overall Score: 0.3064 diff --git a/data/raw_results/sonar/raw_results.jsonl b/data/raw_results/sonar/raw_results.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29214cf9cbee22e72728a90be45d87e0c00a36f0 --- /dev/null +++ b/data/raw_results/sonar/raw_results.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9c0b00be855c7d8372c6b3c0e14f242e96f96e193ebefb06fd8176d63633678 +size 52590 diff --git a/tabs/leaderboard_tab.py b/tabs/leaderboard_tab.py index 01747eee4767f88cedea4798bb57c2382e6f7d66..5b46e1f91b47b5bfa9f81c6c3c88ab560e285405 100644 --- a/tabs/leaderboard_tab.py +++ b/tabs/leaderboard_tab.py @@ -29,21 +29,24 @@ MODEL_CATEGORIES = { "gemini-2.5-pro-deepresearch", "grok-deeper-search", "openai-deepresearch", - "perplexity-Research" + "perplexity-Research", + "doubao-deepresearch", + "kimi-researcher", + "claude-research" ], "LLM with Search": [ "claude-3-7-sonnet-with-search", - "perplexity-sonar-reasoning-pro", - "perplexity-sonar-reasoning", - "perplexity-sonar-pro", - "gemini-2.5-pro-with-grounding", + "claude-3-5-sonnet-with-search", + "sonar-reasoning-pro", + "sonar-reasoning", + "sonar-pro", + "sonar", + "gemini-2.5-pro-preview-05-06", "gpt-4o-search-preview", - "perplexity-sonar", - "gpt-4.1-with-search", + "gpt-4.1", "gemini-2.5-flash-preview-04-17", "gpt-4o-mini-search-preview", - "gpt-4.1-mini-with-search", - "claude-3-5-sonnet-with-search" + "gpt-4.1-mini" ] } diff --git a/utils/merge_raw_data.py b/utils/merge_raw_data.py index 0c46dfc8762cda0ed8af5d4801a359f09860b844..487d00c4a19deba1675239a9a097eb5af4164a29 100644 --- a/utils/merge_raw_data.py +++ b/utils/merge_raw_data.py @@ -6,76 +6,53 @@ import os from pathlib import Path -def calculate_dimension_score(target_score, reference_score): - """计算单个维度的分数,与rank_leaderboard.py中的逻辑一致""" - if (target_score + reference_score) == 0: # 避免除以零 - return 0.0 - return target_score / (target_score + reference_score) - - -def load_scores_for_model(model_results_file_path: Path): - """为单个模型加载所有文章的评分数据""" +def load_scores_for_model(model_results_dir: Path): scores_by_id = {} - if not model_results_file_path.exists(): - print(f"警告: 未找到模型 {model_results_file_path.stem} 的结果文件: {model_results_file_path}") + raw_results_file = model_results_dir / "raw_results.jsonl" + + if not raw_results_file.exists(): + print(f"警告: 未找到模型 {model_results_dir.name} 的结果文件: {raw_results_file}") return scores_by_id - print(f" 正在从 {model_results_file_path.name} 加载分数...") - with open(model_results_file_path, 'r', encoding='utf-8') as f: + print(f" 正在从 {model_results_dir.name}/raw_results.jsonl 加载分数...") + with open(raw_results_file, 'r', encoding='utf-8') as f: for i, line in enumerate(f): try: data = json.loads(line.strip()) - article_id = str(data.get('id')) # 确保ID是字符串以供匹配 + article_id = str(data.get('id')) if not article_id: - print(f" 警告: {model_results_file_path.name} 第 {i+1} 行缺少ID,已跳过。") + print(f" 警告: {model_results_dir.name} 第 {i+1} 行缺少ID,已跳过。") continue - # 直接获取 overall_score (原始值,假设在0-1范围,或者已经是0-100,根据您的数据调整) - # 根据您之前的修改,这里我们假设原始overall_score需要乘以100 overall_score_raw = data.get('overall_score', 0.0) overall_score_scaled = overall_score_raw * 100 - # 计算四个维度的分数 - comp_score_raw = calculate_dimension_score( - data.get('target_comprehensiveness_weighted_avg', 0), - data.get('reference_comprehensiveness_weighted_avg', 0) - ) - insight_score_raw = calculate_dimension_score( - data.get('target_insight_weighted_avg', 0), - data.get('reference_insight_weighted_avg', 0) - ) - instruction_score_raw = calculate_dimension_score( - data.get('target_instruction_following_weighted_avg', 0), - data.get('reference_instruction_following_weighted_avg', 0) - ) - readability_score_raw = calculate_dimension_score( - data.get('target_readability_weighted_avg', 0), - data.get('reference_readability_weighted_avg', 0) - ) + comprehensiveness_score_raw = data.get('comprehensiveness', 0.0) + insight_score_raw = data.get('insight', 0.0) + instruction_score_raw = data.get('instruction_following', 0.0) + readability_score_raw = data.get('readability', 0.0) scores_by_id[article_id] = { 'overall_score': f"{overall_score_scaled:.2f}", - 'comprehensiveness_score': f"{comp_score_raw * 100:.2f}", + 'comprehensiveness_score': f"{comprehensiveness_score_raw * 100:.2f}", 'insight_score': f"{insight_score_raw * 100:.2f}", 'instruction_following_score': f"{instruction_score_raw * 100:.2f}", 'readability_score': f"{readability_score_raw * 100:.2f}" } except json.JSONDecodeError as e: - print(f" 错误: 解析JSON时出错 (文件: {model_results_file_path.name}, 行: {i+1}): {e}") + print(f" 错误: 解析JSON时出错 (文件: {model_results_dir.name}, 行: {i+1}): {e}") except Exception as e: - print(f" 错误: 处理数据时出错 (文件: {model_results_file_path.name}, 行: {i+1}): {e}") - print(f" 为模型 {model_results_file_path.stem} 加载了 {len(scores_by_id)}篇文章的分数") + print(f" 错误: 处理数据时出错 (文件: {model_results_dir.name}, 行: {i+1}): {e}") + print(f" 为模型 {model_results_dir.name} 加载了 {len(scores_by_id)}篇文章的分数") return scores_by_id def merge_jsonl_files(): - # 定义目录路径 project_root = Path(__file__).resolve().parent.parent - raw_data_dir = project_root / "data" / "raw_data" # 包含原始文章内容的目录 - raw_results_dir = project_root / "data" / "raw_results" # 包含评分结果的目录 + raw_data_dir = project_root / "data" / "raw_data" + raw_results_dir = project_root / "data" / "raw_results" output_file = project_root / "data" / "data_viewer.jsonl" - # 获取所有原始数据JSONL文件 input_files = list(raw_data_dir.glob("*.jsonl")) print(f"在 {raw_data_dir} 中找到 {len(input_files)} 个模型JSONL文件") @@ -83,9 +60,8 @@ def merge_jsonl_files(): print("未找到任何原始数据文件,已退出。") return - # 清空输出文件 (如果需要,或者可以采用追加模式,但通常合并操作会重新生成) with open(output_file, 'w', encoding='utf-8') as f: - pass # 创建或清空文件 + pass all_merged_data = [] @@ -93,22 +69,24 @@ def merge_jsonl_files(): model_name = raw_data_file.stem print(f"正在处理原始数据文件: {raw_data_file.name} (模型: {model_name})") - # 为当前模型加载评分数据 - model_results_file = raw_results_dir / f"{model_name}.jsonl" - scores_for_current_model = load_scores_for_model(model_results_file) + model_results_dir = raw_results_dir / model_name + if not model_results_dir.exists(): + print(f" 警告: 未找到模型 {model_name} 对应的结果文件夹: {model_results_dir}") + continue + + scores_for_current_model = load_scores_for_model(model_results_dir) processed_articles_count = 0 with open(raw_data_file, 'r', encoding='utf-8') as f_raw: for i, line in enumerate(f_raw): try: article_data = json.loads(line.strip()) - article_id = str(article_data.get('id')) # 确保ID是字符串 + article_id = str(article_data.get('id')) if not article_id: print(f" 警告: {raw_data_file.name} 第 {i+1} 行缺少ID,已跳过。") continue - # 从加载的评分数据中获取该文章的评分 article_scores = scores_for_current_model.get(article_id, {}) if not article_scores: print(f" 警告: 模型 {model_name} 的文章ID {article_id} 未在结果文件中找到分数。") @@ -118,7 +96,7 @@ def merge_jsonl_files(): 'id': article_id, 'prompt': article_data.get('prompt'), 'article': article_data.get('article'), - 'overall_score': article_scores.get('overall_score'), # 可能为None + 'overall_score': article_scores.get('overall_score'), 'comprehensiveness_score': article_scores.get('comprehensiveness_score'), 'insight_score': article_scores.get('insight_score'), 'instruction_following_score': article_scores.get('instruction_following_score'), @@ -132,7 +110,6 @@ def merge_jsonl_files(): print(f" 错误: 处理原始数据时出错 (文件: {raw_data_file.name}, 行: {i+1}): {e}") print(f" 为模型 {model_name} 处理了 {processed_articles_count} 篇文章数据。") - # 一次性写入所有合并后的数据 with open(output_file, 'w', encoding='utf-8') as f_out: for item in all_merged_data: f_out.write(json.dumps(item, ensure_ascii=False) + '\n') diff --git a/utils/rank_leaderboard.py b/utils/rank_leaderboard.py index 86a2754e949f85cb23d98af5593d2d7beada91ce..1a2112f82edd15877bc3cbee58d6cb725d3afee4 100644 --- a/utils/rank_leaderboard.py +++ b/utils/rank_leaderboard.py @@ -8,124 +8,151 @@ from pathlib import Path from collections import defaultdict -def calculate_dimension_score(target_score, reference_score): - """计算单个维度的分数""" - return target_score / (target_score + reference_score) +def parse_race_result(race_result_file): + """解析race_result.txt文件获取各维度分数""" + scores = {} + + with open(race_result_file, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if ':' in line: + key, value = line.split(':', 1) + key = key.strip() + value = float(value.strip()) + + if key == 'Comprehensiveness': + scores['comprehensiveness'] = value * 100 + elif key == 'Insight': + scores['insight'] = value * 100 + elif key == 'Instruction Following': + scores['instruction_following'] = value * 100 + elif key == 'Readability': + scores['readability'] = value * 100 + elif key == 'Overall Score': + scores['overall_score'] = value * 100 + + return scores -def process_model_data(model_file): - """处理单个模型文件的数据""" - model_name = model_file.stem - print(f"正在处理模型: {model_name}") +def parse_fact_result(fact_result_file): + """解析fact_result.txt文件获取引用相关指标""" + citation_scores = {} - overall_scores = [] - comprehensiveness_scores = [] - insight_scores = [] - instruction_following_scores = [] - readability_scores = [] + if not fact_result_file.exists(): + return citation_scores - with open(model_file, 'r', encoding='utf-8') as f: + with open(fact_result_file, 'r', encoding='utf-8') as f: for line in f: - try: - data = json.loads(line.strip()) - - # 获取总分 - overall_score = data.get('overall_score', 0) - overall_scores.append(overall_score) + line = line.strip() + if ':' in line: + key, value = line.split(':', 1) + key = key.strip() + value = float(value.strip()) - # 计算四个维度的分数 - target_comp = data.get('target_comprehensiveness_weighted_avg', 0) - ref_comp = data.get('reference_comprehensiveness_weighted_avg', 0) - comp_score = calculate_dimension_score(target_comp, ref_comp) - comprehensiveness_scores.append(comp_score) - - target_insight = data.get('target_insight_weighted_avg', 0) - ref_insight = data.get('reference_insight_weighted_avg', 0) - insight_score = calculate_dimension_score(target_insight, ref_insight) - insight_scores.append(insight_score) - - target_instruction = data.get('target_instruction_following_weighted_avg', 0) - ref_instruction = data.get('reference_instruction_following_weighted_avg', 0) - instruction_score = calculate_dimension_score(target_instruction, ref_instruction) - instruction_following_scores.append(instruction_score) - - target_readability = data.get('target_readability_weighted_avg', 0) - ref_readability = data.get('reference_readability_weighted_avg', 0) - readability_score = calculate_dimension_score(target_readability, ref_readability) - readability_scores.append(readability_score) - - except json.JSONDecodeError as e: - print(f"解析JSON时出错 (模型: {model_name}): {e}") - continue - except Exception as e: - print(f"处理数据时出错 (模型: {model_name}): {e}") - continue + if key == 'valid_rate': + citation_scores['citation_accuracy'] = value * 100 + elif key == 'total_valid_citations': + citation_scores['effective_citations'] = value + elif key == 'supported_per_task': + citation_scores['effective_citations'] = value + + return citation_scores + + +def process_model_data(model_dir): + """处理单个模型文件夹的数据""" + model_name = model_dir.name + race_result_file = model_dir / "race_result.txt" + + if not race_result_file.exists(): + print(f"警告: 模型 {model_name} 的文件夹中未找到 race_result.txt") + return None - # 计算平均分 - avg_overall = sum(overall_scores) / len(overall_scores) - avg_comprehensiveness = sum(comprehensiveness_scores) / len(comprehensiveness_scores) - avg_insight = sum(insight_scores) / len(insight_scores) - avg_instruction_following = sum(instruction_following_scores) / len(instruction_following_scores) - avg_readability = sum(readability_scores) / len(readability_scores) - print(f" - 处理了 {len(overall_scores)} 条记录") - print(f" - 总分: {avg_overall:.4f}") + print(f"正在处理模型: {model_name}") - return { - 'model': model_name, - 'overall_score': avg_overall * 100, - 'comprehensiveness': avg_comprehensiveness * 100, - 'insight': avg_insight * 100, - 'instruction_following': avg_instruction_following * 100, - 'readability': avg_readability * 100 - } + try: + scores = parse_race_result(race_result_file) + + if not scores: + print(f" - 警告: 未能解析到有效分数") + return None + + # 查找对应的fact_result.txt文件 + project_root = Path(__file__).parent.parent + fact_results_dir = project_root / "data" / "fact_results" + fact_result_file = fact_results_dir / model_name / "fact_result.txt" + + citation_scores = parse_fact_result(fact_result_file) + + if citation_scores: + print(f" - 总分: {scores['overall_score']:.2f}, 引用准确率: {citation_scores.get('citation_accuracy', 'N/A'):.2f}%, 有效引用数: {citation_scores.get('effective_citations', 'N/A')}") + else: + print(f" - 总分: {scores['overall_score']:.2f}, 引用数据: 未找到") + + result = { + 'model': model_name, + 'overall_score': scores['overall_score'], + 'comprehensiveness': scores['comprehensiveness'], + 'insight': scores['insight'], + 'instruction_following': scores['instruction_following'], + 'readability': scores['readability'], + 'citation_accuracy': citation_scores.get('citation_accuracy', None), + 'effective_citations': citation_scores.get('effective_citations', None) + } + + return result + + except Exception as e: + print(f" - 错误: 处理文件时出错: {e}") + return None def rank_leaderboard(): """计算排行榜并保存到CSV""" - # 定义目录路径 project_root = Path(__file__).parent.parent input_dir = project_root / "data" / "raw_results" output_file = project_root / "data" / "leaderboard.csv" - # 获取所有JSONL文件 - input_files = list(input_dir.glob("*.jsonl")) - print(f"找到 {len(input_files)} 个模型结果文件") + model_dirs = [d for d in input_dir.iterdir() if d.is_dir()] + print(f"找到 {len(model_dirs)} 个模型文件夹") - if not input_files: - print("未找到任何JSONL文件") + if not model_dirs: + print("未找到任何模型文件夹") return - # 处理每个模型的数据 model_results = [] - for input_file in input_files: + for model_dir in model_dirs: try: - result = process_model_data(input_file) - model_results.append(result) + result = process_model_data(model_dir) + if result: + model_results.append(result) except Exception as e: - print(f"处理文件 {input_file.name} 时出错: {e}") + print(f"处理文件夹 {model_dir.name} 时出错: {e}") continue - # 按总分排序(降序) + # 按overall_score排序 model_results.sort(key=lambda x: x['overall_score'], reverse=True) # 写入CSV文件 with open(output_file, 'w', newline='', encoding='utf-8') as csvfile: - fieldnames = ['model', 'overall_score', 'comprehensiveness', 'insight', 'instruction_following', 'readability'] + fieldnames = ['model', 'overall_score', 'comprehensiveness', 'insight', 'instruction_following', 'readability', 'citation_accuracy', 'effective_citations'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - # 写入表头 writer.writeheader() - # 写入数据 for result in model_results: - writer.writerow({ + # 格式化数值,对于None值使用"-" + row = { 'model': result['model'], 'overall_score': f"{result['overall_score']:.2f}", 'comprehensiveness': f"{result['comprehensiveness']:.2f}", 'insight': f"{result['insight']:.2f}", 'instruction_following': f"{result['instruction_following']:.2f}", - 'readability': f"{result['readability']:.2f}" - }) + 'readability': f"{result['readability']:.2f}", + 'citation_accuracy': f"{result['citation_accuracy']:.2f}" if result['citation_accuracy'] is not None else "-", + 'effective_citations': f"{result['effective_citations']:.2f}" if result['effective_citations'] is not None else "-" + } + writer.writerow(row) print(f"\n排行榜已保存到: {output_file}") print(f"共处理了 {len(model_results)} 个模型")