diff --git a/create_leaderboard.py b/create_leaderboard.py
index 07ef86ca9cd7f5861566b7d128c087c2284368f1..839fa37b1853546e5017b5768777cbf13cb120ab 100644
--- a/create_leaderboard.py
+++ b/create_leaderboard.py
@@ -66,7 +66,10 @@ with gr.Blocks(title="DeepResearch Bench") as demo:
Website |
Paper |
Eval Dataset |
- Total models: 16 | Last Update: 28 May 2025
+ Total models: 19 | Last Update: 29 Dec 2024
+
+ Race judge model: gemini-2.5-pro | Fact-checking model: gemini-2.5-flash
+
""")
diff --git a/data/data_viewer.jsonl b/data/data_viewer.jsonl
index 5b61da5f814aa722858757c27b9b9ac5c2d9e2f6..8ba4d39ff095f435b49d964801d863e0d96a40b9 100644
--- a/data/data_viewer.jsonl
+++ b/data/data_viewer.jsonl
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:a7ab11f250f4ffd6bf9c74ff8dc1e68f86d7abbf4f6319164bb476177ad7bf6e
-size 28044256
+oid sha256:647067a9eec626525fa41f257123b5b35f9daf6e9862467e9dc259f987ce621f
+size 40834049
diff --git a/data/fact_results/claude-3-5-sonnet-with-search/fact_result.txt b/data/fact_results/claude-3-5-sonnet-with-search/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2ff98b53590e941a8acd6b98dd3cbc66244f62f9
--- /dev/null
+++ b/data/fact_results/claude-3-5-sonnet-with-search/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 9.94
+total_valid_citations: 9.35
+valid_rate: 0.9406438631790744
diff --git a/data/fact_results/claude-3-7-sonnet-with-search/fact_result.txt b/data/fact_results/claude-3-7-sonnet-with-search/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6a37dedf9a28c0433bf2d1f131dcd6309f0e3b95
--- /dev/null
+++ b/data/fact_results/claude-3-7-sonnet-with-search/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 28.07
+total_valid_citations: 24.51
+valid_rate: 0.8731742073387959
diff --git a/data/fact_results/doubao-deepresearch/fact_result.txt b/data/fact_results/doubao-deepresearch/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9a47562576f9f95fd0d3d0b39df1711c8f0e704d
--- /dev/null
+++ b/data/fact_results/doubao-deepresearch/fact_result.txt
@@ -0,0 +1,3 @@
+citations_per_task: 99.5510
+supported_per_task: 52.6224
+valid_rate: 0.5286
diff --git a/data/fact_results/gemini-2.5-flash-preview-04-17/fact_result.txt b/data/fact_results/gemini-2.5-flash-preview-04-17/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..befd1e7f66c36fef1bdd22c6d66a18689da0e707
--- /dev/null
+++ b/data/fact_results/gemini-2.5-flash-preview-04-17/fact_result.txt
@@ -0,0 +1 @@
+No tasks with valid results.
diff --git a/data/fact_results/gemini-2.5-pro-deepresearch/fact_result.txt b/data/fact_results/gemini-2.5-pro-deepresearch/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..da40e8b7211a8f3869c7dcc7473e55f772bab7c8
--- /dev/null
+++ b/data/fact_results/gemini-2.5-pro-deepresearch/fact_result.txt
@@ -0,0 +1,3 @@
+citations_per_task: 211.1616
+supported_per_task: 165.3434
+valid_rate: 0.7830
diff --git a/data/fact_results/gemini-2.5-pro-preview-05-06/fact_result.txt b/data/fact_results/gemini-2.5-pro-preview-05-06/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..befd1e7f66c36fef1bdd22c6d66a18689da0e707
--- /dev/null
+++ b/data/fact_results/gemini-2.5-pro-preview-05-06/fact_result.txt
@@ -0,0 +1 @@
+No tasks with valid results.
diff --git a/data/fact_results/gpt-4.1-mini/fact_result.txt b/data/fact_results/gpt-4.1-mini/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..665a51e8d338db73242be5ff3102c882d8cd7390
--- /dev/null
+++ b/data/fact_results/gpt-4.1-mini/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 4.85
+total_valid_citations: 4.1
+valid_rate: 0.845360824742268
diff --git a/data/fact_results/gpt-4.1/fact_result.txt b/data/fact_results/gpt-4.1/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b74a4e7f6d445d1dc8a80a1a3a192062accce06d
--- /dev/null
+++ b/data/fact_results/gpt-4.1/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 4.746987951807229
+total_valid_citations: 4.265060240963855
+valid_rate: 0.8984771573604061
diff --git a/data/fact_results/gpt-4o-mini-search-preview/fact_result.txt b/data/fact_results/gpt-4o-mini-search-preview/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e221efe2487d53e3582ca9738dc74958dad2812d
--- /dev/null
+++ b/data/fact_results/gpt-4o-mini-search-preview/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 5.651162790697675
+total_valid_citations: 4.616279069767442
+valid_rate: 0.8168724279835391
diff --git a/data/fact_results/gpt-4o-search-preview/fact_result.txt b/data/fact_results/gpt-4o-search-preview/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9ffbdcc8cd3713d8bcad82828caa4d9d3d120868
--- /dev/null
+++ b/data/fact_results/gpt-4o-search-preview/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 5.825581395348837
+total_valid_citations: 5.046511627906977
+valid_rate: 0.8662674650698603
diff --git a/data/fact_results/grok-deeper-search/fact_result.txt b/data/fact_results/grok-deeper-search/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4a9792615a44408ce67ff57f32674a8a625b7b29
--- /dev/null
+++ b/data/fact_results/grok-deeper-search/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 11.741935483870968
+total_valid_citations: 8.580645161290322
+valid_rate: 0.7307692307692307
diff --git a/data/fact_results/openai-deepresearch/fact_result.txt b/data/fact_results/openai-deepresearch/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2dd776b43995475dc5dbad6f7053521854f9117b
--- /dev/null
+++ b/data/fact_results/openai-deepresearch/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 53.04040404040404
+total_valid_citations: 39.78787878787879
+valid_rate: 0.7501428299371549
diff --git a/data/fact_results/perplexity-Research/fact_result.txt b/data/fact_results/perplexity-Research/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f0af84e34bc98cc6d7dbab0d78eae80ad78b6ac3
--- /dev/null
+++ b/data/fact_results/perplexity-Research/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 37.76
+total_valid_citations: 31.2
+valid_rate: 0.826271186440678
diff --git a/data/fact_results/sonar-pro/fact_result.txt b/data/fact_results/sonar-pro/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..74f4102aa61797806770d0e4c3ba576bf8335974
--- /dev/null
+++ b/data/fact_results/sonar-pro/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 21.01
+total_valid_citations: 16.75
+valid_rate: 0.7972394098048549
diff --git a/data/fact_results/sonar-reasoning-pro/fact_result.txt b/data/fact_results/sonar-reasoning-pro/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..02d6e3e611db6c1a65ec3a1fad6c4bcc81ccebcd
--- /dev/null
+++ b/data/fact_results/sonar-reasoning-pro/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 20.78
+total_valid_citations: 9.39
+valid_rate: 0.45187680461982677
diff --git a/data/fact_results/sonar-reasoning/fact_result.txt b/data/fact_results/sonar-reasoning/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..535e9bc2665b13492f624d5d3622afbbfdfb79c6
--- /dev/null
+++ b/data/fact_results/sonar-reasoning/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 25.43
+total_valid_citations: 13.37
+valid_rate: 0.525756979944947
diff --git a/data/fact_results/sonar/fact_result.txt b/data/fact_results/sonar/fact_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..15ed17aeb5fe191bec3ca90ab57f10cc3804f561
--- /dev/null
+++ b/data/fact_results/sonar/fact_result.txt
@@ -0,0 +1,3 @@
+total_citations: 13.97872340425532
+total_valid_citations: 10.680851063829786
+valid_rate: 0.7640791476407914
diff --git a/data/leaderboard.csv b/data/leaderboard.csv
index facdb1b2f115eb9ea4627520110806a07ae91d9b..bd6ee4f900bf1f337adef416f8087da4b482d770 100644
--- a/data/leaderboard.csv
+++ b/data/leaderboard.csv
@@ -1,17 +1,20 @@
model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations
-gemini-2.5-pro-deepresearch,48.88,48.53,48.50,49.18,49.44,81.44,111.21
-openai-deepresearch,46.98,46.87,45.25,49.27,47.14,77.96,40.79
-perplexity-Research,42.25,40.69,39.39,46.40,44.28,90.24,31.26
-claude-3-7-sonnet-with-search,40.67,38.99,37.66,45.77,41.46,93.68,32.48
-grok-deeper-search,40.24,37.97,35.37,46.30,44.05,83.59,8.15
-perplexity-sonar-reasoning-pro,40.22,37.38,36.11,45.66,44.74,39.36,8.35
-perplexity-sonar-reasoning,40.18,37.14,36.73,45.15,44.35,48.67,11.34
-perplexity-sonar-pro,38.93,36.38,34.26,44.70,43.35,78.66,14.74
-gemini-2.5-pro-with-grounding,35.12,34.06,29.79,41.67,37.16,81.81,32.88
-gpt-4o-search-preview,35.10,31.99,27.57,43.17,41.23,88.41,4.79
-perplexity-sonar,34.54,30.95,27.51,42.33,41.60,74.42,8.67
-gpt-4.1-with-search,33.46,29.42,25.38,42.33,40.77,87.83,4.42
-gemini-2.5-flash-preview-04-17,32.39,31.63,26.73,38.82,34.48,81.92,31.08
-gpt-4o-mini-search-preview,31.55,27.38,22.64,40.67,39.91,84.98,4.95
-gpt-4.1-mini-with-search,30.26,26.05,20.75,39.65,39.33,84.58,4.35
-claude-3-5-sonnet-with-search,28.48,24.82,22.82,35.12,35.08,94.04,9.78
\ No newline at end of file
+gemini-2.5-pro-deepresearch,48.92,48.45,48.30,49.29,49.77,78.30,165.34
+openai-deepresearch,46.45,46.46,43.73,49.39,47.22,75.01,39.79
+claude-research,45.00,45.34,42.79,47.58,44.66,-,-
+doubao-deepresearch,44.34,44.84,40.56,47.95,44.69,52.86,52.62
+kimi-researcher,42.69,42.82,39.40,45.30,44.68,-,-
+perplexity-Research,40.46,39.10,35.65,46.11,43.08,82.63,31.20
+grok-deeper-search,38.22,36.08,30.89,46.59,42.17,73.08,8.58
+sonar-reasoning-pro,37.76,34.96,31.65,44.93,42.42,45.19,9.39
+sonar-reasoning,37.75,34.73,32.59,44.42,42.39,52.58,13.37
+claude-3-7-sonnet-with-search,36.63,35.95,31.29,44.05,36.07,87.32,24.51
+sonar-pro,36.19,33.92,29.69,43.39,41.07,79.72,16.75
+gemini-2.5-pro-preview-05-06,31.90,31.75,24.61,40.24,32.76,-,-
+gpt-4o-search-preview,30.74,27.81,20.44,41.01,37.60,86.63,5.05
+sonar,30.64,27.14,21.62,40.70,37.46,76.41,10.68
+gpt-4.1,29.31,25.59,18.42,40.63,36.49,89.85,4.27
+gemini-2.5-flash-preview-04-17,29.19,28.97,21.62,37.80,29.97,-,-
+gpt-4o-mini-search-preview,27.62,24.24,16.62,38.59,35.27,81.69,4.62
+gpt-4.1-mini,26.62,22.86,15.39,38.18,34.49,84.54,4.10
+claude-3-5-sonnet-with-search,23.95,21.28,16.20,32.41,29.87,94.06,9.35
diff --git a/data/raw_data/claude-research.jsonl b/data/raw_data/claude-research.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..5fcaceb4db2b3c38d494b42d5f47461651a7bc11
--- /dev/null
+++ b/data/raw_data/claude-research.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:300f8dbc8242a5852bbe44098403f35fac1e4136e2274c93f0a3d659fee00d7f
+size 1513379
diff --git a/data/raw_data/doubao-deepresearch.jsonl b/data/raw_data/doubao-deepresearch.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..7a2e78037c006e549a862aa1e12a30b894497722
--- /dev/null
+++ b/data/raw_data/doubao-deepresearch.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b9512cedf730486da486ba9d0ec305213ca8176bd00b93da788367f090717f2
+size 7451876
diff --git a/data/raw_data/gemini-2.5-flash-with-grounding.jsonl b/data/raw_data/gemini-2.5-flash-preview-04-17.jsonl
similarity index 100%
rename from data/raw_data/gemini-2.5-flash-with-grounding.jsonl
rename to data/raw_data/gemini-2.5-flash-preview-04-17.jsonl
diff --git a/data/raw_data/gemini-2.5-pro-deepresearch.jsonl b/data/raw_data/gemini-2.5-pro-deepresearch.jsonl
index 06ebc42cb73fab6016d60139b4c774971103e52a..8fe301da9499c3feaacea4dd2732d3bbe5a7a1e0 100644
--- a/data/raw_data/gemini-2.5-pro-deepresearch.jsonl
+++ b/data/raw_data/gemini-2.5-pro-deepresearch.jsonl
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:33c5d28e76595f22fae1b0fbbe2700958bfe707dafe53f7c5842d3067ccfddef
-size 8523353
+oid sha256:3705106cc42a38b7c7bcd90e42aaec7f688a0d52179329e3074fcda99ea544e7
+size 8523153
diff --git a/data/raw_data/gemini-2.5-pro-with-grounding.jsonl b/data/raw_data/gemini-2.5-pro-preview-05-06.jsonl
similarity index 100%
rename from data/raw_data/gemini-2.5-pro-with-grounding.jsonl
rename to data/raw_data/gemini-2.5-pro-preview-05-06.jsonl
diff --git a/data/raw_data/gpt-4.1-mini-with-search.jsonl b/data/raw_data/gpt-4.1-mini.jsonl
similarity index 100%
rename from data/raw_data/gpt-4.1-mini-with-search.jsonl
rename to data/raw_data/gpt-4.1-mini.jsonl
diff --git a/data/raw_data/gpt-4.1-with-search.jsonl b/data/raw_data/gpt-4.1-with-search.jsonl
deleted file mode 100644
index 03ca8b1b5a43fc274c4749e82c2e175ec226cc84..0000000000000000000000000000000000000000
--- a/data/raw_data/gpt-4.1-with-search.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e0228624b09e9d6c25c72156f4dd7f5702e3adcdd71a1f309094c2913eb50639
-size 492406
diff --git a/data/raw_data/gpt-4.1.jsonl b/data/raw_data/gpt-4.1.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..19efed14017cd0d84f49e92b3814b3b6fe34ff72
--- /dev/null
+++ b/data/raw_data/gpt-4.1.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7611ff4a5cd8824aa711c9759d7dff02a990a57fbb36d699442484431bd5662
+size 492409
diff --git a/data/raw_data/grok-deeper-search.jsonl b/data/raw_data/grok-deeper-search.jsonl
index 3af0f1a3a4366ea4f7f66c3972de52547d81795d..5a6b524d0b48b82851be7e3d9c965a8ed8ccafd9 100644
--- a/data/raw_data/grok-deeper-search.jsonl
+++ b/data/raw_data/grok-deeper-search.jsonl
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:ea6428dcf2e729d84f019c302fb3862a85cefbea08282b5ffcc5c400306ab077
-size 1149933
+oid sha256:f14c64de8c22d66b5a1c08af0cb0d829d9a4b671378a2952230b2219d258f0ba
+size 1149833
diff --git a/data/raw_data/kimi-researcher.jsonl b/data/raw_data/kimi-researcher.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..d04c644a1d1e13665c60cd395a7bde11474f4ba1
--- /dev/null
+++ b/data/raw_data/kimi-researcher.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:868ac817d88b63a7a253ecf4439b85205ad1c49f2879f4b46f1a9a34d6cf804f
+size 3773315
diff --git a/data/raw_data/openai-deepresearch.jsonl b/data/raw_data/openai-deepresearch.jsonl
index 89f6322b4e7f791282100772ae2f47300509288a..8cb76d73b594224de123058b67d8a00d8c96970f 100644
--- a/data/raw_data/openai-deepresearch.jsonl
+++ b/data/raw_data/openai-deepresearch.jsonl
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:77d31b8ea1abd9aa8e924303451dc6a0f334f2e9d4d61ec71847c4db004ac62a
-size 6903938
+oid sha256:8a9dbbf7f18d8c985bc4d4f450089eb4bb73e77dbf7168a1bb4c81f811e06d84
+size 6903838
diff --git a/data/raw_data/perplexity-Research.jsonl b/data/raw_data/perplexity-Research.jsonl
index 14b989cca9757ec346ad21bdf63a15077a02b5af..e00b167dfb2b581a40fb84982234195fb9933225 100644
--- a/data/raw_data/perplexity-Research.jsonl
+++ b/data/raw_data/perplexity-Research.jsonl
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:7f27cb31cbab84f60efc3286592e84690fd117355dd84f9e4a9299108245c2a5
-size 1747979
+oid sha256:0a3b855862c99f108abf97b9e402b43eb4d3376c3ec93c2e0a9c871b70d0736e
+size 1747879
diff --git a/data/raw_data/perplexity-sonar-pro.jsonl b/data/raw_data/perplexity-sonar-pro.jsonl
deleted file mode 100644
index 56c0d03f64c5cd25b4907cd7254d2930afefb2d0..0000000000000000000000000000000000000000
--- a/data/raw_data/perplexity-sonar-pro.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5d577c0a208b35eb2c0454c00c70b12759cd8a1687f730f2133d8f392c1831ee
-size 750234
diff --git a/data/raw_data/perplexity-sonar-reasoning-pro.jsonl b/data/raw_data/perplexity-sonar-reasoning-pro.jsonl
deleted file mode 100644
index b649b3f834897f26017bb8c2515f629b2824473c..0000000000000000000000000000000000000000
--- a/data/raw_data/perplexity-sonar-reasoning-pro.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e08c6c4094bbf0aa1749e7b1a45e856a6635b2df6afdf0de8eeafea99e7477fc
-size 495156
diff --git a/data/raw_data/perplexity-sonar.jsonl b/data/raw_data/perplexity-sonar.jsonl
deleted file mode 100644
index 0052be2f8957ae0cef0794674113e9035e5c428c..0000000000000000000000000000000000000000
--- a/data/raw_data/perplexity-sonar.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:dc0ef26282e404b700d56e158644f44228c49a3d5126fa12c8068e053444131e
-size 574856
diff --git a/data/raw_data/sonar-pro.jsonl b/data/raw_data/sonar-pro.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..1adaad480adc82c3156ded24a94583f41d1f7470
--- /dev/null
+++ b/data/raw_data/sonar-pro.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39dbb2c1eae9fe1bc32abaedebe75bbc643ba18ec25e3360726ffb9d514c52ec
+size 750234
diff --git a/data/raw_data/sonar-reasoning-pro.jsonl b/data/raw_data/sonar-reasoning-pro.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..7fa5906943b0173e9ddc0f56fe3d88e8ed2fe60b
--- /dev/null
+++ b/data/raw_data/sonar-reasoning-pro.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffa11b21375455127385b57c69442a2772aa96f0d5b410e274d994eacf920c00
+size 495156
diff --git a/data/raw_data/perplexity-sonar-reasoning.jsonl b/data/raw_data/sonar-reasoning.jsonl
similarity index 100%
rename from data/raw_data/perplexity-sonar-reasoning.jsonl
rename to data/raw_data/sonar-reasoning.jsonl
diff --git a/data/raw_data/sonar.jsonl b/data/raw_data/sonar.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..7e3544ee01fd82117e61e2f79610ea94b401916a
--- /dev/null
+++ b/data/raw_data/sonar.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96626a86dca8c67848e8ce9e71e76c4cddec7066ba209e2018d13781ad23f17f
+size 574856
diff --git a/data/raw_results/claude-3-5-sonnet-with-search.jsonl b/data/raw_results/claude-3-5-sonnet-with-search.jsonl
deleted file mode 100644
index 6cbf4daaed0703e6af45f79f84ec8b7de9b73551..0000000000000000000000000000000000000000
--- a/data/raw_results/claude-3-5-sonnet-with-search.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c0c47d1bab126886420bd53bb41a8905cdfb97f105711bcc2f5a27e3d53652ea
-size 1992421
diff --git a/data/raw_results/claude-3-5-sonnet-with-search/race_result.txt b/data/raw_results/claude-3-5-sonnet-with-search/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4950288405425e05e3e491e7715c091f2e813b47
--- /dev/null
+++ b/data/raw_results/claude-3-5-sonnet-with-search/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.2128
+Insight: 0.1620
+Instruction Following: 0.3241
+Readability: 0.2987
+Overall Score: 0.2395
diff --git a/data/raw_results/claude-3-5-sonnet-with-search/raw_results.jsonl b/data/raw_results/claude-3-5-sonnet-with-search/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..48e8689fcb74e25c0b58a8b6b66455c5abfa4898
--- /dev/null
+++ b/data/raw_results/claude-3-5-sonnet-with-search/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0423e9b7888c1070e43272284414d91deb490e38d48d0f7d6afc004db6fea291
+size 52686
diff --git a/data/raw_results/claude-3-7-sonnet-with-search.jsonl b/data/raw_results/claude-3-7-sonnet-with-search.jsonl
deleted file mode 100644
index 4d4b9e9e111c47cc90050482a374c1c1ddfb3893..0000000000000000000000000000000000000000
--- a/data/raw_results/claude-3-7-sonnet-with-search.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2b3a6bf74400c89d24fa47853ab034ed3696ee0694c2d190ba83c3f5dcd8a0ef
-size 2002379
diff --git a/data/raw_results/claude-3-7-sonnet-with-search/race_result.txt b/data/raw_results/claude-3-7-sonnet-with-search/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..eae703902d64d312abd5e4078c10b9ec0aff474e
--- /dev/null
+++ b/data/raw_results/claude-3-7-sonnet-with-search/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.3595
+Insight: 0.3129
+Instruction Following: 0.4405
+Readability: 0.3607
+Overall Score: 0.3663
diff --git a/data/raw_results/claude-3-7-sonnet-with-search/raw_results.jsonl b/data/raw_results/claude-3-7-sonnet-with-search/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..4ef93f616f5d3e24b0916d49b1482211c320293e
--- /dev/null
+++ b/data/raw_results/claude-3-7-sonnet-with-search/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fde53e238b0f658ccc09d1efb7d6a13e276cd9d67e9e722adc5531dc1561853
+size 52517
diff --git a/data/raw_results/claude-research/race_result.txt b/data/raw_results/claude-research/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0fb132c7a92322f3f021dec1453377cd9dfa7cb1
--- /dev/null
+++ b/data/raw_results/claude-research/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.4534
+Insight: 0.4279
+Instruction Following: 0.4758
+Readability: 0.4466
+Overall Score: 0.4500
diff --git a/data/raw_results/claude-research/raw_results.jsonl b/data/raw_results/claude-research/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..24868ca43ca1a8b7334cb535810ff3809c643d43
--- /dev/null
+++ b/data/raw_results/claude-research/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6616b89c3b38eb5f4822cdf82326648652c3fb19b5aca5ada99552cba4f529
+size 52454
diff --git a/data/raw_results/doubao-deepresearch/race_result.txt b/data/raw_results/doubao-deepresearch/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dd6f591e6b8c979d9b071ca503a001110cad00a3
--- /dev/null
+++ b/data/raw_results/doubao-deepresearch/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.4484
+Insight: 0.4056
+Instruction Following: 0.4795
+Readability: 0.4469
+Overall Score: 0.4434
diff --git a/data/raw_results/doubao-deepresearch/raw_results.jsonl b/data/raw_results/doubao-deepresearch/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..6c93261832f7be8b84d3e4b6339870a56865038c
--- /dev/null
+++ b/data/raw_results/doubao-deepresearch/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:732d86bd51bfb687821590ecc227cbce30663d309069e812d6e4bba103ca5e2a
+size 51890
diff --git a/data/raw_results/gemini-2.5-flash-preview-04-17/race_result.txt b/data/raw_results/gemini-2.5-flash-preview-04-17/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1e84bbaeced824bc43630875b1ae6f8b582b4a3e
--- /dev/null
+++ b/data/raw_results/gemini-2.5-flash-preview-04-17/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.2897
+Insight: 0.2162
+Instruction Following: 0.3780
+Readability: 0.2997
+Overall Score: 0.2919
diff --git a/data/raw_results/gemini-2.5-flash-preview-04-17/raw_results.jsonl b/data/raw_results/gemini-2.5-flash-preview-04-17/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..2b7d939a134e08cd34755c66e812adec21e4d684
--- /dev/null
+++ b/data/raw_results/gemini-2.5-flash-preview-04-17/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ee20445df1bc4ca52e8c0de0d8f02d65e808df8f834b789fed749004e2bc3a4
+size 52637
diff --git a/data/raw_results/gemini-2.5-flash-with-grounding.jsonl b/data/raw_results/gemini-2.5-flash-with-grounding.jsonl
deleted file mode 100644
index 06b805115fa66dfdac2aa885a6d5fe5d09129c37..0000000000000000000000000000000000000000
--- a/data/raw_results/gemini-2.5-flash-with-grounding.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:43b9f71819babb5c00f65f0dd71d707323fb803c585bd74976f49cdc34ab80aa
-size 1951481
diff --git a/data/raw_results/gemini-2.5-pro-deepresearch.jsonl b/data/raw_results/gemini-2.5-pro-deepresearch.jsonl
deleted file mode 100644
index d0cfc1e3352805fd32bbec4397b7945440da9337..0000000000000000000000000000000000000000
--- a/data/raw_results/gemini-2.5-pro-deepresearch.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ac2fc53c99697e3276c98d735ed630df6fa49d2972c70a5409adc1958ecaa7b7
-size 1937730
diff --git a/data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt b/data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3462a5d40845565c4e6651cb43984d73ea161c3f
--- /dev/null
+++ b/data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.4845
+Insight: 0.4830
+Instruction Following: 0.4929
+Readability: 0.4977
+Overall Score: 0.4892
diff --git a/data/raw_results/gemini-2.5-pro-deepresearch/raw_results.jsonl b/data/raw_results/gemini-2.5-pro-deepresearch/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..4eecca39d953880d6f9b4dc909a027df9d967d65
--- /dev/null
+++ b/data/raw_results/gemini-2.5-pro-deepresearch/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34e1afac1851c1e81b65f1f3844aa8da886ef20558d7891ce7145f7c63cc53ca
+size 51986
diff --git a/data/raw_results/gemini-2.5-pro-preview-05-06/race_result.txt b/data/raw_results/gemini-2.5-pro-preview-05-06/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e34a15ab4d1fbf3ac85495ec1e1c29fbdcc4fe2d
--- /dev/null
+++ b/data/raw_results/gemini-2.5-pro-preview-05-06/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.3175
+Insight: 0.2461
+Instruction Following: 0.4024
+Readability: 0.3276
+Overall Score: 0.3190
diff --git a/data/raw_results/gemini-2.5-pro-preview-05-06/raw_results.jsonl b/data/raw_results/gemini-2.5-pro-preview-05-06/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..d739db7b21b5b2d01d394c45df9e15ebcd9ea358
--- /dev/null
+++ b/data/raw_results/gemini-2.5-pro-preview-05-06/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce921d7368ee00c8a2d573331fd42e5b813af29ecfc367048cdb3b1367ff3e43
+size 52566
diff --git a/data/raw_results/gemini-2.5-pro-with-grounding.jsonl b/data/raw_results/gemini-2.5-pro-with-grounding.jsonl
deleted file mode 100644
index 64b458d9c012cecfd581bbef650f5a5fea526a8d..0000000000000000000000000000000000000000
--- a/data/raw_results/gemini-2.5-pro-with-grounding.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5e911a18cf8b8a8207eb45584ac650e4640f79db7352055ca5e92356de37f911
-size 1944815
diff --git a/data/raw_results/gpt-4.1-mini-with-search.jsonl b/data/raw_results/gpt-4.1-mini-with-search.jsonl
deleted file mode 100644
index afb748e6922224b0767dd4944dc7ff0e242c118b..0000000000000000000000000000000000000000
--- a/data/raw_results/gpt-4.1-mini-with-search.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:948a403d12bcf6b0e3ce6664f83afeb95413684ab0b7912003ed756a4df15c5e
-size 1992345
diff --git a/data/raw_results/gpt-4.1-mini/race_result.txt b/data/raw_results/gpt-4.1-mini/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f3954c80e00f551808c7674a9b48137359576c68
--- /dev/null
+++ b/data/raw_results/gpt-4.1-mini/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.2286
+Insight: 0.1539
+Instruction Following: 0.3818
+Readability: 0.3449
+Overall Score: 0.2662
diff --git a/data/raw_results/gpt-4.1-mini/raw_results.jsonl b/data/raw_results/gpt-4.1-mini/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..14bc78e7ce47cacee0957c251d66cdad703eb2fa
--- /dev/null
+++ b/data/raw_results/gpt-4.1-mini/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b47556452f7b784c3c4f6d730e058937706fa5adeea4aeb93a30a710dfa6412
+size 52638
diff --git a/data/raw_results/gpt-4.1-with-search.jsonl b/data/raw_results/gpt-4.1-with-search.jsonl
deleted file mode 100644
index a2b0e22d3e3e4d03bca5442dbbdf52fe402705d8..0000000000000000000000000000000000000000
--- a/data/raw_results/gpt-4.1-with-search.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:908a5989af337e381bf2bce6795438edd21966f313b5194f532feb1f47e5b812
-size 2090582
diff --git a/data/raw_results/gpt-4.1/race_result.txt b/data/raw_results/gpt-4.1/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dc508b619460f8620a860b0a82c4cacb67867fae
--- /dev/null
+++ b/data/raw_results/gpt-4.1/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.2559
+Insight: 0.1842
+Instruction Following: 0.4063
+Readability: 0.3649
+Overall Score: 0.2931
diff --git a/data/raw_results/gpt-4.1/raw_results.jsonl b/data/raw_results/gpt-4.1/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..adad3e3f4c8f7671c0fb472fbf17f00bf6da6b33
--- /dev/null
+++ b/data/raw_results/gpt-4.1/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1a7903c48fb04860b0bb1718ce7004d492dbab5068545d2a321b80c5f16f711
+size 52618
diff --git a/data/raw_results/gpt-4o-mini-search-preview.jsonl b/data/raw_results/gpt-4o-mini-search-preview.jsonl
deleted file mode 100644
index 7279175fb94fb3cdb888c7285edeebf9d1967f07..0000000000000000000000000000000000000000
--- a/data/raw_results/gpt-4o-mini-search-preview.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4277a9a91fcdaaeff1afe948c1088095d5f01092404fcd1a62407b7a58b7906e
-size 2074673
diff --git a/data/raw_results/gpt-4o-mini-search-preview/race_result.txt b/data/raw_results/gpt-4o-mini-search-preview/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d43e0dca99168528d11ce47febd36d539fba116e
--- /dev/null
+++ b/data/raw_results/gpt-4o-mini-search-preview/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.2424
+Insight: 0.1662
+Instruction Following: 0.3859
+Readability: 0.3527
+Overall Score: 0.2762
diff --git a/data/raw_results/gpt-4o-mini-search-preview/raw_results.jsonl b/data/raw_results/gpt-4o-mini-search-preview/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..163c29b07451c25438c0ab9048bfd0cf5b4d63ef
--- /dev/null
+++ b/data/raw_results/gpt-4o-mini-search-preview/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34ec46e38b055a970b4086fb2db9cfa98330793add7ff2757c02acc55de68b32
+size 52637
diff --git a/data/raw_results/gpt-4o-search-preview.jsonl b/data/raw_results/gpt-4o-search-preview.jsonl
deleted file mode 100644
index 58cae9bc4dfa1f4d9d8aff2f7f388c399c34f14b..0000000000000000000000000000000000000000
--- a/data/raw_results/gpt-4o-search-preview.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7adcd70d49d3b5dd6050201aa4fcd31f51288945f4a23de14432a301cbf295a7
-size 2063854
diff --git a/data/raw_results/gpt-4o-search-preview/race_result.txt b/data/raw_results/gpt-4o-search-preview/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..397872a5c7b26a3ed633af238378e409518e2978
--- /dev/null
+++ b/data/raw_results/gpt-4o-search-preview/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.2781
+Insight: 0.2044
+Instruction Following: 0.4101
+Readability: 0.3760
+Overall Score: 0.3074
diff --git a/data/raw_results/gpt-4o-search-preview/raw_results.jsonl b/data/raw_results/gpt-4o-search-preview/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..37fc5c1304f6fed1283b08e9a34196dca57ee537
--- /dev/null
+++ b/data/raw_results/gpt-4o-search-preview/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6ad7042da675dccdf90749986ce95fdc0f9bc2df2b3cd4db50bff4e70ff3ab5
+size 52617
diff --git a/data/raw_results/grok-deeper-search.jsonl b/data/raw_results/grok-deeper-search.jsonl
deleted file mode 100644
index 00b5fe934660df3f6a024886f4e51e5dfac0ed94..0000000000000000000000000000000000000000
--- a/data/raw_results/grok-deeper-search.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b19fb7ec93872317eae94abeb02ed9c19912057acfa82600167ca853b750f476
-size 1968989
diff --git a/data/raw_results/grok-deeper-search/race_result.txt b/data/raw_results/grok-deeper-search/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..db282b350dcbccac12e458645ce174165eda99a1
--- /dev/null
+++ b/data/raw_results/grok-deeper-search/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.3608
+Insight: 0.3089
+Instruction Following: 0.4659
+Readability: 0.4217
+Overall Score: 0.3822
diff --git a/data/raw_results/grok-deeper-search/raw_results.jsonl b/data/raw_results/grok-deeper-search/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..2071550228b1407a7879d5c60d9e5f1f58b492a6
--- /dev/null
+++ b/data/raw_results/grok-deeper-search/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56d1a924942aadf2cbab3fb718d654cdb25de59383991cbf340c58eef0671abd
+size 52491
diff --git a/data/raw_results/kimi-researcher/race_result.txt b/data/raw_results/kimi-researcher/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..67b954e0ada304d7d2de4e25ac6233d6250f8373
--- /dev/null
+++ b/data/raw_results/kimi-researcher/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.4282
+Insight: 0.3940
+Instruction Following: 0.4530
+Readability: 0.4468
+Overall Score: 0.4269
diff --git a/data/raw_results/kimi-researcher/raw_results.jsonl b/data/raw_results/kimi-researcher/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..9d7841429343616a2cde51a98b7ab6470496ec2b
--- /dev/null
+++ b/data/raw_results/kimi-researcher/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2117bd7eb0cc91e705d64cb3013b2bf89cffce190f57b0fde5638a2efd6f027d
+size 52510
diff --git a/data/raw_results/openai-deepresearch.jsonl b/data/raw_results/openai-deepresearch.jsonl
deleted file mode 100644
index 603263d5b3d798707cc8950b76c5877c18943e31..0000000000000000000000000000000000000000
--- a/data/raw_results/openai-deepresearch.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ae45c25f5b5c56a772331543e4eefe7c80e63f33b441dfe83cb4a5c830c88a35
-size 2007501
diff --git a/data/raw_results/openai-deepresearch/race_result.txt b/data/raw_results/openai-deepresearch/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..94181c515d754b55aaf8be6f01ec2cecfdc9b64a
--- /dev/null
+++ b/data/raw_results/openai-deepresearch/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.4646
+Insight: 0.4373
+Instruction Following: 0.4939
+Readability: 0.4722
+Overall Score: 0.4645
diff --git a/data/raw_results/openai-deepresearch/raw_results.jsonl b/data/raw_results/openai-deepresearch/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..e255f35c53b9be22e7c755e0a5d260d3f1c082c9
--- /dev/null
+++ b/data/raw_results/openai-deepresearch/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ba1919ed5dc8b57e65aaabfec2bfaf8cbacd9b5ac69ca753de6110c21a100e7
+size 52313
diff --git a/data/raw_results/perplexity-Research.jsonl b/data/raw_results/perplexity-Research.jsonl
deleted file mode 100644
index 7aaab6c2c3979022f50cf4b639be8de8a36ae40e..0000000000000000000000000000000000000000
--- a/data/raw_results/perplexity-Research.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b7715271d17cc344873653464ae3fef884e0f3c6bec89deee347ed7a0651beb9
-size 2030483
diff --git a/data/raw_results/perplexity-Research/race_result.txt b/data/raw_results/perplexity-Research/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..29c6e53812660ef84379b87f7a11c8e29fca47a5
--- /dev/null
+++ b/data/raw_results/perplexity-Research/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.3910
+Insight: 0.3565
+Instruction Following: 0.4611
+Readability: 0.4308
+Overall Score: 0.4046
diff --git a/data/raw_results/perplexity-Research/raw_results.jsonl b/data/raw_results/perplexity-Research/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..0a7838b67822ce1a5bbb0ac9765cb7c60bb24549
--- /dev/null
+++ b/data/raw_results/perplexity-Research/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1141aa12a92d2026982cd899799ee773ece898211097f5e5cb3bdadf91a9a199
+size 52489
diff --git a/data/raw_results/perplexity-sonar-pro.jsonl b/data/raw_results/perplexity-sonar-pro.jsonl
deleted file mode 100644
index 26188e17b7dd8c314c23e1022dc93c2a4ac581fe..0000000000000000000000000000000000000000
--- a/data/raw_results/perplexity-sonar-pro.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a453f5b29492f684f53364121e7c79eeb81aee2737a383e2748830a4e4453afb
-size 1975770
diff --git a/data/raw_results/perplexity-sonar-reasoning-pro.jsonl b/data/raw_results/perplexity-sonar-reasoning-pro.jsonl
deleted file mode 100644
index de695f14d14d614715ff9b2acb6ad6fd4801f506..0000000000000000000000000000000000000000
--- a/data/raw_results/perplexity-sonar-reasoning-pro.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:669a4a14232c63c716de766af7be050f8712f74a6d5437cc8fa637ded39f3c40
-size 1957092
diff --git a/data/raw_results/perplexity-sonar-reasoning.jsonl b/data/raw_results/perplexity-sonar-reasoning.jsonl
deleted file mode 100644
index ca14b740222c630835f04801a5f485cf3fa22fb2..0000000000000000000000000000000000000000
--- a/data/raw_results/perplexity-sonar-reasoning.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bceb5637a9d0092af5ddcca49557a4f8f3604be9ebb430be32e820fa4d6723b3
-size 1951258
diff --git a/data/raw_results/perplexity-sonar.jsonl b/data/raw_results/perplexity-sonar.jsonl
deleted file mode 100644
index 240df6ffd5c6e230e1e22264cd4f476fe8634b8a..0000000000000000000000000000000000000000
--- a/data/raw_results/perplexity-sonar.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:36ecd1540447863f66bfe1a43905070f9c9b0d40de803348c3450a396df3d8fc
-size 2016838
diff --git a/data/raw_results/sonar-pro/race_result.txt b/data/raw_results/sonar-pro/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ac3847542f354f41d0d0644cf29a38e4569e4e88
--- /dev/null
+++ b/data/raw_results/sonar-pro/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.3392
+Insight: 0.2969
+Instruction Following: 0.4339
+Readability: 0.4107
+Overall Score: 0.3619
diff --git a/data/raw_results/sonar-pro/raw_results.jsonl b/data/raw_results/sonar-pro/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..bd0804de5cacfba2df10d1090b90977aab7cdc4d
--- /dev/null
+++ b/data/raw_results/sonar-pro/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4badd5671a8bccf51ad1ad1f4617925abed30815153ecaa1b8eb3ceec4f46834
+size 52573
diff --git a/data/raw_results/sonar-reasoning-pro/race_result.txt b/data/raw_results/sonar-reasoning-pro/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..86ccf604bd5843241260b3ab34fbcff1055b16f9
--- /dev/null
+++ b/data/raw_results/sonar-reasoning-pro/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.3496
+Insight: 0.3165
+Instruction Following: 0.4493
+Readability: 0.4242
+Overall Score: 0.3776
diff --git a/data/raw_results/sonar-reasoning-pro/raw_results.jsonl b/data/raw_results/sonar-reasoning-pro/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..ae08519af3511e23a7234999f0022b1b24c5b7e4
--- /dev/null
+++ b/data/raw_results/sonar-reasoning-pro/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:416a07a69a027d6dff5985260f8dffa2af7ea87456ca93e5fe33e003b68da15e
+size 52469
diff --git a/data/raw_results/sonar-reasoning/race_result.txt b/data/raw_results/sonar-reasoning/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7d52936c03f6f313f4c0f6c2ff201b71942a6c0c
--- /dev/null
+++ b/data/raw_results/sonar-reasoning/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.3473
+Insight: 0.3259
+Instruction Following: 0.4442
+Readability: 0.4239
+Overall Score: 0.3775
diff --git a/data/raw_results/sonar-reasoning/raw_results.jsonl b/data/raw_results/sonar-reasoning/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..c6a5127accc7c79ae3e1a57e56bdcda6cfe2e553
--- /dev/null
+++ b/data/raw_results/sonar-reasoning/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:930e0af6b6ac7a97c4bd4dc55b9c95aab5e14a630c887fd69a69d58e367d667f
+size 52498
diff --git a/data/raw_results/sonar/race_result.txt b/data/raw_results/sonar/race_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1d7a2d099e0d187f205be03c261c67a30afc64c7
--- /dev/null
+++ b/data/raw_results/sonar/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.2714
+Insight: 0.2162
+Instruction Following: 0.4070
+Readability: 0.3746
+Overall Score: 0.3064
diff --git a/data/raw_results/sonar/raw_results.jsonl b/data/raw_results/sonar/raw_results.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..29214cf9cbee22e72728a90be45d87e0c00a36f0
--- /dev/null
+++ b/data/raw_results/sonar/raw_results.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9c0b00be855c7d8372c6b3c0e14f242e96f96e193ebefb06fd8176d63633678
+size 52590
diff --git a/tabs/leaderboard_tab.py b/tabs/leaderboard_tab.py
index 01747eee4767f88cedea4798bb57c2382e6f7d66..5b46e1f91b47b5bfa9f81c6c3c88ab560e285405 100644
--- a/tabs/leaderboard_tab.py
+++ b/tabs/leaderboard_tab.py
@@ -29,21 +29,24 @@ MODEL_CATEGORIES = {
"gemini-2.5-pro-deepresearch",
"grok-deeper-search",
"openai-deepresearch",
- "perplexity-Research"
+ "perplexity-Research",
+ "doubao-deepresearch",
+ "kimi-researcher",
+ "claude-research"
],
"LLM with Search": [
"claude-3-7-sonnet-with-search",
- "perplexity-sonar-reasoning-pro",
- "perplexity-sonar-reasoning",
- "perplexity-sonar-pro",
- "gemini-2.5-pro-with-grounding",
+ "claude-3-5-sonnet-with-search",
+ "sonar-reasoning-pro",
+ "sonar-reasoning",
+ "sonar-pro",
+ "sonar",
+ "gemini-2.5-pro-preview-05-06",
"gpt-4o-search-preview",
- "perplexity-sonar",
- "gpt-4.1-with-search",
+ "gpt-4.1",
"gemini-2.5-flash-preview-04-17",
"gpt-4o-mini-search-preview",
- "gpt-4.1-mini-with-search",
- "claude-3-5-sonnet-with-search"
+ "gpt-4.1-mini"
]
}
diff --git a/utils/merge_raw_data.py b/utils/merge_raw_data.py
index 0c46dfc8762cda0ed8af5d4801a359f09860b844..487d00c4a19deba1675239a9a097eb5af4164a29 100644
--- a/utils/merge_raw_data.py
+++ b/utils/merge_raw_data.py
@@ -6,76 +6,53 @@ import os
from pathlib import Path
-def calculate_dimension_score(target_score, reference_score):
- """计算单个维度的分数,与rank_leaderboard.py中的逻辑一致"""
- if (target_score + reference_score) == 0: # 避免除以零
- return 0.0
- return target_score / (target_score + reference_score)
-
-
-def load_scores_for_model(model_results_file_path: Path):
-    """为单个模型加载所有文章的评分数据"""
+def load_scores_for_model(model_results_dir: Path):
+    """Load the per-article scores stored for one model.
+
+    Reads ``raw_results.jsonl`` inside ``model_results_dir`` (one JSON object
+    per line) and returns a dict keyed by the article id as ``str``. Each
+    value holds the overall score and the four dimension scores, multiplied
+    by 100 and formatted with two decimals.
+
+    Returns an empty dict when the file does not exist. Records without an
+    id are reported and skipped; lines that fail to parse or process are
+    reported and skipped as well.
+    """
     scores_by_id = {}
-    if not model_results_file_path.exists():
-        print(f"警告: 未找到模型 {model_results_file_path.stem} 的结果文件: {model_results_file_path}")
+    raw_results_file = model_results_dir / "raw_results.jsonl"
+
+    if not raw_results_file.exists():
+        print(f"警告: 未找到模型 {model_results_dir.name} 的结果文件: {raw_results_file}")
         return scores_by_id
-    print(f" 正在从 {model_results_file_path.name} 加载分数...")
-    with open(model_results_file_path, 'r', encoding='utf-8') as f:
+    print(f" 正在从 {model_results_dir.name}/raw_results.jsonl 加载分数...")
+    with open(raw_results_file, 'r', encoding='utf-8') as f:
         for i, line in enumerate(f):
             try:
                 data = json.loads(line.strip())
-                article_id = str(data.get('id')) # 确保ID是字符串以供匹配
+                # str(None) is the truthy string "None", so a missing id has to
+                # be mapped to '' for the guard below to actually skip the record.
+                raw_id = data.get('id')
+                article_id = '' if raw_id is None else str(raw_id)
                 if not article_id:
-                    print(f" 警告: {model_results_file_path.name} 第 {i+1} 行缺少ID,已跳过。")
+                    print(f" 警告: {model_results_dir.name} 第 {i+1} 行缺少ID,已跳过。")
                     continue
-                # 直接获取 overall_score (原始值,假设在0-1范围,或者已经是0-100,根据您的数据调整)
-                # 根据您之前的修改,这里我们假设原始overall_score需要乘以100
                 overall_score_raw = data.get('overall_score', 0.0)
                 overall_score_scaled = overall_score_raw * 100
-                # 计算四个维度的分数
-                comp_score_raw = calculate_dimension_score(
-                    data.get('target_comprehensiveness_weighted_avg', 0),
-                    data.get('reference_comprehensiveness_weighted_avg', 0)
-                )
-                insight_score_raw = calculate_dimension_score(
-                    data.get('target_insight_weighted_avg', 0),
-                    data.get('reference_insight_weighted_avg', 0)
-                )
-                instruction_score_raw = calculate_dimension_score(
-                    data.get('target_instruction_following_weighted_avg', 0),
-                    data.get('reference_instruction_following_weighted_avg', 0)
-                )
-                readability_score_raw = calculate_dimension_score(
-                    data.get('target_readability_weighted_avg', 0),
-                    data.get('reference_readability_weighted_avg', 0)
-                )
+                comprehensiveness_score_raw = data.get('comprehensiveness', 0.0)
+                insight_score_raw = data.get('insight', 0.0)
+                instruction_score_raw = data.get('instruction_following', 0.0)
+                readability_score_raw = data.get('readability', 0.0)
                 scores_by_id[article_id] = {
                     'overall_score': f"{overall_score_scaled:.2f}",
-                    'comprehensiveness_score': f"{comp_score_raw * 100:.2f}",
+                    'comprehensiveness_score': f"{comprehensiveness_score_raw * 100:.2f}",
                     'insight_score': f"{insight_score_raw * 100:.2f}",
                     'instruction_following_score': f"{instruction_score_raw * 100:.2f}",
                     'readability_score': f"{readability_score_raw * 100:.2f}"
                 }
             except json.JSONDecodeError as e:
-                print(f" 错误: 解析JSON时出错 (文件: {model_results_file_path.name}, 行: {i+1}): {e}")
+                print(f" 错误: 解析JSON时出错 (文件: {model_results_dir.name}, 行: {i+1}): {e}")
             except Exception as e:
-                print(f" 错误: 处理数据时出错 (文件: {model_results_file_path.name}, 行: {i+1}): {e}")
-    print(f" 为模型 {model_results_file_path.stem} 加载了 {len(scores_by_id)}篇文章的分数")
+                print(f" 错误: 处理数据时出错 (文件: {model_results_dir.name}, 行: {i+1}): {e}")
+    print(f" 为模型 {model_results_dir.name} 加载了 {len(scores_by_id)}篇文章的分数")
     return scores_by_id
def merge_jsonl_files():
- # 定义目录路径
project_root = Path(__file__).resolve().parent.parent
- raw_data_dir = project_root / "data" / "raw_data" # 包含原始文章内容的目录
- raw_results_dir = project_root / "data" / "raw_results" # 包含评分结果的目录
+ raw_data_dir = project_root / "data" / "raw_data"
+ raw_results_dir = project_root / "data" / "raw_results"
output_file = project_root / "data" / "data_viewer.jsonl"
- # 获取所有原始数据JSONL文件
input_files = list(raw_data_dir.glob("*.jsonl"))
print(f"在 {raw_data_dir} 中找到 {len(input_files)} 个模型JSONL文件")
@@ -83,9 +60,8 @@ def merge_jsonl_files():
print("未找到任何原始数据文件,已退出。")
return
- # 清空输出文件 (如果需要,或者可以采用追加模式,但通常合并操作会重新生成)
with open(output_file, 'w', encoding='utf-8') as f:
- pass # 创建或清空文件
+ pass
all_merged_data = []
@@ -93,22 +69,24 @@ def merge_jsonl_files():
model_name = raw_data_file.stem
print(f"正在处理原始数据文件: {raw_data_file.name} (模型: {model_name})")
- # 为当前模型加载评分数据
- model_results_file = raw_results_dir / f"{model_name}.jsonl"
- scores_for_current_model = load_scores_for_model(model_results_file)
+ model_results_dir = raw_results_dir / model_name
+ if not model_results_dir.exists():
+ print(f" 警告: 未找到模型 {model_name} 对应的结果文件夹: {model_results_dir}")
+ continue
+
+ scores_for_current_model = load_scores_for_model(model_results_dir)
processed_articles_count = 0
with open(raw_data_file, 'r', encoding='utf-8') as f_raw:
for i, line in enumerate(f_raw):
try:
article_data = json.loads(line.strip())
- article_id = str(article_data.get('id')) # 确保ID是字符串
+ article_id = str(article_data.get('id'))
if not article_id:
print(f" 警告: {raw_data_file.name} 第 {i+1} 行缺少ID,已跳过。")
continue
- # 从加载的评分数据中获取该文章的评分
article_scores = scores_for_current_model.get(article_id, {})
if not article_scores:
print(f" 警告: 模型 {model_name} 的文章ID {article_id} 未在结果文件中找到分数。")
@@ -118,7 +96,7 @@ def merge_jsonl_files():
'id': article_id,
'prompt': article_data.get('prompt'),
'article': article_data.get('article'),
- 'overall_score': article_scores.get('overall_score'), # 可能为None
+ 'overall_score': article_scores.get('overall_score'),
'comprehensiveness_score': article_scores.get('comprehensiveness_score'),
'insight_score': article_scores.get('insight_score'),
'instruction_following_score': article_scores.get('instruction_following_score'),
@@ -132,7 +110,6 @@ def merge_jsonl_files():
print(f" 错误: 处理原始数据时出错 (文件: {raw_data_file.name}, 行: {i+1}): {e}")
print(f" 为模型 {model_name} 处理了 {processed_articles_count} 篇文章数据。")
- # 一次性写入所有合并后的数据
with open(output_file, 'w', encoding='utf-8') as f_out:
for item in all_merged_data:
f_out.write(json.dumps(item, ensure_ascii=False) + '\n')
diff --git a/utils/rank_leaderboard.py b/utils/rank_leaderboard.py
index 86a2754e949f85cb23d98af5593d2d7beada91ce..1a2112f82edd15877bc3cbee58d6cb725d3afee4 100644
--- a/utils/rank_leaderboard.py
+++ b/utils/rank_leaderboard.py
@@ -8,124 +8,151 @@ from pathlib import Path
from collections import defaultdict
-def calculate_dimension_score(target_score, reference_score):
- """计算单个维度的分数"""
- return target_score / (target_score + reference_score)
+def parse_race_result(race_result_file):
+    """Read a race_result.txt file and return its scores multiplied by 100.
+
+    Every line containing ':' is split at the first colon and the right-hand
+    side is converted with float(); a non-numeric value raises ValueError to
+    the caller. Recognised labels are stored under snake_case keys, any other
+    label is ignored.
+    """
+    label_to_key = {
+        'Comprehensiveness': 'comprehensiveness',
+        'Insight': 'insight',
+        'Instruction Following': 'instruction_following',
+        'Readability': 'readability',
+        'Overall Score': 'overall_score',
+    }
+    scores = {}
+
+    with open(race_result_file, 'r', encoding='utf-8') as f:
+        for raw_line in f:
+            label, colon, raw_value = raw_line.strip().partition(':')
+            if not colon:
+                continue
+            # Convert before the lookup so a malformed value raises even on
+            # a line whose label is not recognised.
+            number = float(raw_value.strip())
+            target = label_to_key.get(label.strip())
+            if target is not None:
+                scores[target] = number * 100
+
+    return scores
-def process_model_data(model_file):
- """处理单个模型文件的数据"""
- model_name = model_file.stem
- print(f"正在处理模型: {model_name}")
+def parse_fact_result(fact_result_file):
+    """Read a fact_result.txt file and return the citation metrics found in it.
+
+    Returns an empty dict when the file does not exist. ``valid_rate`` is
+    stored as ``citation_accuracy`` multiplied by 100; ``total_valid_citations``
+    and ``supported_per_task`` are both stored as ``effective_citations``
+    (the later line wins). Other labels are ignored; a non-numeric value
+    raises ValueError to the caller.
+    """
+    metrics = {}
-    overall_scores = []
-    comprehensiveness_scores = []
-    insight_scores = []
-    instruction_following_scores = []
-    readability_scores = []
+    if not fact_result_file.exists():
+        return metrics
-    with open(model_file, 'r', encoding='utf-8') as f:
+    with open(fact_result_file, 'r', encoding='utf-8') as f:
         for line in f:
-            try:
-                data = json.loads(line.strip())
-
-                # 获取总分
-                overall_score = data.get('overall_score', 0)
-                overall_scores.append(overall_score)
+            name, colon, raw_value = line.strip().partition(':')
+            if not colon:
+                continue
+            name = name.strip()
+            number = float(raw_value.strip())
-                # 计算四个维度的分数
-                target_comp = data.get('target_comprehensiveness_weighted_avg', 0)
-                ref_comp = data.get('reference_comprehensiveness_weighted_avg', 0)
-                comp_score = calculate_dimension_score(target_comp, ref_comp)
-                comprehensiveness_scores.append(comp_score)
-
-                target_insight = data.get('target_insight_weighted_avg', 0)
-                ref_insight = data.get('reference_insight_weighted_avg', 0)
-                insight_score = calculate_dimension_score(target_insight, ref_insight)
-                insight_scores.append(insight_score)
-
-                target_instruction = data.get('target_instruction_following_weighted_avg', 0)
-                ref_instruction = data.get('reference_instruction_following_weighted_avg', 0)
-                instruction_score = calculate_dimension_score(target_instruction, ref_instruction)
-                instruction_following_scores.append(instruction_score)
-
-                target_readability = data.get('target_readability_weighted_avg', 0)
-                ref_readability = data.get('reference_readability_weighted_avg', 0)
-                readability_score = calculate_dimension_score(target_readability, ref_readability)
-                readability_scores.append(readability_score)
-
-            except json.JSONDecodeError as e:
-                print(f"解析JSON时出错 (模型: {model_name}): {e}")
-                continue
-            except Exception as e:
-                print(f"处理数据时出错 (模型: {model_name}): {e}")
-                continue
+            if name == 'valid_rate':
+                metrics['citation_accuracy'] = number * 100
+            elif name in ('total_valid_citations', 'supported_per_task'):
+                metrics['effective_citations'] = number
+
+    return metrics
+
+
+def process_model_data(model_dir):
+    """Collect the leaderboard row for one model directory.
+
+    Reads ``race_result.txt`` from ``model_dir`` and, when present,
+    ``data/fact_results/<model name>/fact_result.txt`` under the project
+    root. Returns a dict with the model name, the five RACE scores and the
+    two citation metrics (``None`` when unavailable). Returns ``None`` when
+    ``race_result.txt`` is missing, yields no scores, or processing fails.
+    """
+    model_name = model_dir.name
+    race_result_file = model_dir / "race_result.txt"
+
+    if not race_result_file.exists():
+        print(f"警告: 模型 {model_name} 的文件夹中未找到 race_result.txt")
+        return None
-    # 计算平均分
-    avg_overall = sum(overall_scores) / len(overall_scores)
-    avg_comprehensiveness = sum(comprehensiveness_scores) / len(comprehensiveness_scores)
-    avg_insight = sum(insight_scores) / len(insight_scores)
-    avg_instruction_following = sum(instruction_following_scores) / len(instruction_following_scores)
-    avg_readability = sum(readability_scores) / len(readability_scores)
-    print(f" - 处理了 {len(overall_scores)} 条记录")
-    print(f" - 总分: {avg_overall:.4f}")
+    print(f"正在处理模型: {model_name}")
-    return {
-        'model': model_name,
-        'overall_score': avg_overall * 100,
-        'comprehensiveness': avg_comprehensiveness * 100,
-        'insight': avg_insight * 100,
-        'instruction_following': avg_instruction_following * 100,
-        'readability': avg_readability * 100
-    }
+    try:
+        scores = parse_race_result(race_result_file)
+
+        if not scores:
+            print(f" - 警告: 未能解析到有效分数")
+            return None
+
+        # Locate the matching fact_result.txt for this model.
+        project_root = Path(__file__).parent.parent
+        fact_results_dir = project_root / "data" / "fact_results"
+        fact_result_file = fact_results_dir / model_name / "fact_result.txt"
+
+        citation_scores = parse_fact_result(fact_result_file)
+
+        if citation_scores:
+            # Either metric may be absent from fact_result.txt. Applying the
+            # '.2f' spec to the 'N/A' fallback raises ValueError, which the
+            # except below would turn into a silently dropped model, so the
+            # numeric format is applied only to a real number.
+            accuracy = citation_scores.get('citation_accuracy')
+            accuracy_text = f"{accuracy:.2f}%" if accuracy is not None else "N/A"
+            print(f" - 总分: {scores['overall_score']:.2f}, 引用准确率: {accuracy_text}, 有效引用数: {citation_scores.get('effective_citations', 'N/A')}")
+        else:
+            print(f" - 总分: {scores['overall_score']:.2f}, 引用数据: 未找到")
+
+        result = {
+            'model': model_name,
+            'overall_score': scores['overall_score'],
+            'comprehensiveness': scores['comprehensiveness'],
+            'insight': scores['insight'],
+            'instruction_following': scores['instruction_following'],
+            'readability': scores['readability'],
+            'citation_accuracy': citation_scores.get('citation_accuracy', None),
+            'effective_citations': citation_scores.get('effective_citations', None)
+        }
+
+        return result
+
+    except Exception as e:
+        print(f" - 错误: 处理文件时出错: {e}")
+        return None
def rank_leaderboard():
"""计算排行榜并保存到CSV"""
- # 定义目录路径
project_root = Path(__file__).parent.parent
input_dir = project_root / "data" / "raw_results"
output_file = project_root / "data" / "leaderboard.csv"
- # 获取所有JSONL文件
- input_files = list(input_dir.glob("*.jsonl"))
- print(f"找到 {len(input_files)} 个模型结果文件")
+ model_dirs = [d for d in input_dir.iterdir() if d.is_dir()]
+ print(f"找到 {len(model_dirs)} 个模型文件夹")
- if not input_files:
- print("未找到任何JSONL文件")
+ if not model_dirs:
+ print("未找到任何模型文件夹")
return
- # 处理每个模型的数据
model_results = []
- for input_file in input_files:
+ for model_dir in model_dirs:
try:
- result = process_model_data(input_file)
- model_results.append(result)
+ result = process_model_data(model_dir)
+ if result:
+ model_results.append(result)
except Exception as e:
- print(f"处理文件 {input_file.name} 时出错: {e}")
+ print(f"处理文件夹 {model_dir.name} 时出错: {e}")
continue
- # 按总分排序(降序)
+ # 按overall_score排序
model_results.sort(key=lambda x: x['overall_score'], reverse=True)
# 写入CSV文件
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
- fieldnames = ['model', 'overall_score', 'comprehensiveness', 'insight', 'instruction_following', 'readability']
+ fieldnames = ['model', 'overall_score', 'comprehensiveness', 'insight', 'instruction_following', 'readability', 'citation_accuracy', 'effective_citations']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
- # 写入表头
writer.writeheader()
- # 写入数据
for result in model_results:
- writer.writerow({
+ # 格式化数值,对于None值使用"-"
+ row = {
'model': result['model'],
'overall_score': f"{result['overall_score']:.2f}",
'comprehensiveness': f"{result['comprehensiveness']:.2f}",
'insight': f"{result['insight']:.2f}",
'instruction_following': f"{result['instruction_following']:.2f}",
- 'readability': f"{result['readability']:.2f}"
- })
+ 'readability': f"{result['readability']:.2f}",
+ 'citation_accuracy': f"{result['citation_accuracy']:.2f}" if result['citation_accuracy'] is not None else "-",
+ 'effective_citations': f"{result['effective_citations']:.2f}" if result['effective_citations'] is not None else "-"
+ }
+ writer.writerow(row)
print(f"\n排行榜已保存到: {output_file}")
print(f"共处理了 {len(model_results)} 个模型")