Spaces:

Ayanami0730
/

DeepResearch-Leaderboard

Running

App Files Files Community

Ayanami0730 committed on Jul 15

Commit

1d11ffb

1 Parent(s): 2f64f8a

Update latest data

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

create_leaderboard.py +4 -1
data/data_viewer.jsonl +2 -2
data/fact_results/claude-3-5-sonnet-with-search/fact_result.txt +3 -0
data/fact_results/claude-3-7-sonnet-with-search/fact_result.txt +3 -0
data/fact_results/doubao-deepresearch/fact_result.txt +3 -0
data/fact_results/gemini-2.5-flash-preview-04-17/fact_result.txt +1 -0
data/fact_results/gemini-2.5-pro-deepresearch/fact_result.txt +3 -0
data/fact_results/gemini-2.5-pro-preview-05-06/fact_result.txt +1 -0
data/fact_results/gpt-4.1-mini/fact_result.txt +3 -0
data/fact_results/gpt-4.1/fact_result.txt +3 -0
data/fact_results/gpt-4o-mini-search-preview/fact_result.txt +3 -0
data/fact_results/gpt-4o-search-preview/fact_result.txt +3 -0
data/fact_results/grok-deeper-search/fact_result.txt +3 -0
data/fact_results/openai-deepresearch/fact_result.txt +3 -0
data/fact_results/perplexity-Research/fact_result.txt +3 -0
data/fact_results/sonar-pro/fact_result.txt +3 -0
data/fact_results/sonar-reasoning-pro/fact_result.txt +3 -0
data/fact_results/sonar-reasoning/fact_result.txt +3 -0
data/fact_results/sonar/fact_result.txt +3 -0
data/leaderboard.csv +19 -16
data/raw_data/claude-research.jsonl +3 -0
data/raw_data/doubao-deepresearch.jsonl +3 -0
data/raw_data/{gemini-2.5-flash-with-grounding.jsonl → gemini-2.5-flash-preview-04-17.jsonl} +0 -0
data/raw_data/gemini-2.5-pro-deepresearch.jsonl +2 -2
data/raw_data/{gemini-2.5-pro-with-grounding.jsonl → gemini-2.5-pro-preview-05-06.jsonl} +0 -0
data/raw_data/{gpt-4.1-mini-with-search.jsonl → gpt-4.1-mini.jsonl} +0 -0
data/raw_data/{gpt-4.1-with-search.jsonl → gpt-4.1.jsonl} +2 -2
data/raw_data/grok-deeper-search.jsonl +2 -2
data/raw_data/kimi-researcher.jsonl +3 -0
data/raw_data/openai-deepresearch.jsonl +2 -2
data/raw_data/perplexity-Research.jsonl +2 -2
data/raw_data/{perplexity-sonar-pro.jsonl → sonar-pro.jsonl} +1 -1
data/raw_data/{perplexity-sonar-reasoning-pro.jsonl → sonar-reasoning-pro.jsonl} +1 -1
data/raw_data/{perplexity-sonar-reasoning.jsonl → sonar-reasoning.jsonl} +0 -0
data/raw_data/{perplexity-sonar.jsonl → sonar.jsonl} +1 -1
data/raw_results/claude-3-5-sonnet-with-search.jsonl +0 -3
data/raw_results/claude-3-5-sonnet-with-search/race_result.txt +5 -0
data/raw_results/claude-3-5-sonnet-with-search/raw_results.jsonl +3 -0
data/raw_results/claude-3-7-sonnet-with-search.jsonl +0 -3
data/raw_results/claude-3-7-sonnet-with-search/race_result.txt +5 -0
data/raw_results/claude-3-7-sonnet-with-search/raw_results.jsonl +3 -0
data/raw_results/claude-research/race_result.txt +5 -0
data/raw_results/claude-research/raw_results.jsonl +3 -0
data/raw_results/doubao-deepresearch/race_result.txt +5 -0
data/raw_results/doubao-deepresearch/raw_results.jsonl +3 -0
data/raw_results/gemini-2.5-flash-preview-04-17/race_result.txt +5 -0
data/raw_results/gemini-2.5-flash-preview-04-17/raw_results.jsonl +3 -0
data/raw_results/gemini-2.5-flash-with-grounding.jsonl +0 -3
data/raw_results/gemini-2.5-pro-deepresearch.jsonl +0 -3
data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt +5 -0

create_leaderboard.py CHANGED Viewed

@@ -66,7 +66,10 @@ with gr.Blocks(title="DeepResearch Bench") as demo:
       <a href="https://deepresearch-bench.github.io" target="_blank">Website</a> |
       <a href="https://arxiv.org/abs/2506.11763" target="_blank">Paper</a> |
       <a href="#" target="_blank">Eval Dataset</a> |
-      Total models: 16 | Last Update: 28 May 2025
     </div>
     """)

       <a href="https://deepresearch-bench.github.io" target="_blank">Website</a> |
       <a href="https://arxiv.org/abs/2506.11763" target="_blank">Paper</a> |
       <a href="#" target="_blank">Eval Dataset</a> |
+      Total models: 19 | Last Update: 15 Jul 2025<br>
+      <small style="color: #666; font-size: 0.9em;">
+        Race judge model: gemini-2.5-pro | Fact-checking models: gemini-2.5-flash
+      </small>
     </div>
     """)

data/data_viewer.jsonl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a7ab11f250f4ffd6bf9c74ff8dc1e68f86d7abbf4f6319164bb476177ad7bf6e
-size 28044256

 version https://git-lfs.github.com/spec/v1
+oid sha256:647067a9eec626525fa41f257123b5b35f9daf6e9862467e9dc259f987ce621f
+size 40834049

data/fact_results/claude-3-5-sonnet-with-search/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 9.94
+total_valid_citations: 9.35
+valid_rate: 0.9406438631790744

data/fact_results/claude-3-7-sonnet-with-search/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 28.07
+total_valid_citations: 24.51
+valid_rate: 0.8731742073387959

data/fact_results/doubao-deepresearch/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+citations_per_task: 99.5510
+supported_per_task: 52.6224
+valid_rate: 0.5286

data/fact_results/gemini-2.5-flash-preview-04-17/fact_result.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ No tasks with valid results.

data/fact_results/gemini-2.5-pro-deepresearch/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+citations_per_task: 211.1616
+supported_per_task: 165.3434
+valid_rate: 0.7830

data/fact_results/gemini-2.5-pro-preview-05-06/fact_result.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ No tasks with valid results.

data/fact_results/gpt-4.1-mini/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 4.85
+total_valid_citations: 4.1
+valid_rate: 0.845360824742268

data/fact_results/gpt-4.1/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 4.746987951807229
+total_valid_citations: 4.265060240963855
+valid_rate: 0.8984771573604061

data/fact_results/gpt-4o-mini-search-preview/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 5.651162790697675
+total_valid_citations: 4.616279069767442
+valid_rate: 0.8168724279835391

data/fact_results/gpt-4o-search-preview/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 5.825581395348837
+total_valid_citations: 5.046511627906977
+valid_rate: 0.8662674650698603

data/fact_results/grok-deeper-search/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 11.741935483870968
+total_valid_citations: 8.580645161290322
+valid_rate: 0.7307692307692307

data/fact_results/openai-deepresearch/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 53.04040404040404
+total_valid_citations: 39.78787878787879
+valid_rate: 0.7501428299371549

data/fact_results/perplexity-Research/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 37.76
+total_valid_citations: 31.2
+valid_rate: 0.826271186440678

data/fact_results/sonar-pro/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 21.01
+total_valid_citations: 16.75
+valid_rate: 0.7972394098048549

data/fact_results/sonar-reasoning-pro/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 20.78
+total_valid_citations: 9.39
+valid_rate: 0.45187680461982677

data/fact_results/sonar-reasoning/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 25.43
+total_valid_citations: 13.37
+valid_rate: 0.525756979944947

data/fact_results/sonar/fact_result.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+total_citations: 13.97872340425532
+total_valid_citations: 10.680851063829786
+valid_rate: 0.7640791476407914

data/leaderboard.csv CHANGED Viewed

@@ -1,17 +1,20 @@
 model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations
-gemini-2.5-pro-deepresearch,48.88,48.53,48.50,49.18,49.44,81.44,111.21
-openai-deepresearch,46.98,46.87,45.25,49.27,47.14,77.96,40.79
-perplexity-Research,42.25,40.69,39.39,46.40,44.28,90.24,31.26
-claude-3-7-sonnet-with-search,40.67,38.99,37.66,45.77,41.46,93.68,32.48
-grok-deeper-search,40.24,37.97,35.37,46.30,44.05,83.59,8.15
-perplexity-sonar-reasoning-pro,40.22,37.38,36.11,45.66,44.74,39.36,8.35
-perplexity-sonar-reasoning,40.18,37.14,36.73,45.15,44.35,48.67,11.34
-perplexity-sonar-pro,38.93,36.38,34.26,44.70,43.35,78.66,14.74
-gemini-2.5-pro-with-grounding,35.12,34.06,29.79,41.67,37.16,81.81,32.88
-gpt-4o-search-preview,35.10,31.99,27.57,43.17,41.23,88.41,4.79
-perplexity-sonar,34.54,30.95,27.51,42.33,41.60,74.42,8.67
-gpt-4.1-with-search,33.46,29.42,25.38,42.33,40.77,87.83,4.42
-gemini-2.5-flash-preview-04-17,32.39,31.63,26.73,38.82,34.48,81.92,31.08
-gpt-4o-mini-search-preview,31.55,27.38,22.64,40.67,39.91,84.98,4.95
-gpt-4.1-mini-with-search,30.26,26.05,20.75,39.65,39.33,84.58,4.35
-claude-3-5-sonnet-with-search,28.48,24.82,22.82,35.12,35.08,94.04,9.78

 model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations
+gemini-2.5-pro-deepresearch,48.92,48.45,48.30,49.29,49.77,78.30,165.34
+openai-deepresearch,46.45,46.46,43.73,49.39,47.22,75.01,39.79
+claude-research,45.00,45.34,42.79,47.58,44.66,-,-
+doubao-deepresearch,44.34,44.84,40.56,47.95,44.69,52.86,52.62
+kimi-researcher,42.69,42.82,39.40,45.30,44.68,-,-
+perplexity-Research,40.46,39.10,35.65,46.11,43.08,82.63,31.20
+grok-deeper-search,38.22,36.08,30.89,46.59,42.17,73.08,8.58
+sonar-reasoning-pro,37.76,34.96,31.65,44.93,42.42,45.19,9.39
+sonar-reasoning,37.75,34.73,32.59,44.42,42.39,52.58,13.37
+claude-3-7-sonnet-with-search,36.63,35.95,31.29,44.05,36.07,87.32,24.51
+sonar-pro,36.19,33.92,29.69,43.39,41.07,79.72,16.75
+gemini-2.5-pro-preview-05-06,31.90,31.75,24.61,40.24,32.76,-,-
+gpt-4o-search-preview,30.74,27.81,20.44,41.01,37.60,86.63,5.05
+sonar,30.64,27.14,21.62,40.70,37.46,76.41,10.68
+gpt-4.1,29.31,25.59,18.42,40.63,36.49,89.85,4.27
+gemini-2.5-flash-preview-04-17,29.19,28.97,21.62,37.80,29.97,-,-
+gpt-4o-mini-search-preview,27.62,24.24,16.62,38.59,35.27,81.69,4.62
+gpt-4.1-mini,26.62,22.86,15.39,38.18,34.49,84.54,4.10
+claude-3-5-sonnet-with-search,23.95,21.28,16.20,32.41,29.87,94.06,9.35

data/raw_data/claude-research.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:300f8dbc8242a5852bbe44098403f35fac1e4136e2274c93f0a3d659fee00d7f
+size 1513379

data/raw_data/doubao-deepresearch.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b9512cedf730486da486ba9d0ec305213ca8176bd00b93da788367f090717f2
+size 7451876

data/raw_data/{gemini-2.5-flash-with-grounding.jsonl → gemini-2.5-flash-preview-04-17.jsonl} RENAMED Viewed

File without changes

data/raw_data/gemini-2.5-pro-deepresearch.jsonl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33c5d28e76595f22fae1b0fbbe2700958bfe707dafe53f7c5842d3067ccfddef
-size 8523353

 version https://git-lfs.github.com/spec/v1
+oid sha256:3705106cc42a38b7c7bcd90e42aaec7f688a0d52179329e3074fcda99ea544e7
+size 8523153

data/raw_data/{gemini-2.5-pro-with-grounding.jsonl → gemini-2.5-pro-preview-05-06.jsonl} RENAMED Viewed

File without changes

data/raw_data/{gpt-4.1-mini-with-search.jsonl → gpt-4.1-mini.jsonl} RENAMED Viewed

File without changes

data/raw_data/{gpt-4.1-with-search.jsonl → gpt-4.1.jsonl} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e0228624b09e9d6c25c72156f4dd7f5702e3adcdd71a1f309094c2913eb50639
-size 492406

 version https://git-lfs.github.com/spec/v1
+oid sha256:a7611ff4a5cd8824aa711c9759d7dff02a990a57fbb36d699442484431bd5662
+size 492409

data/raw_data/grok-deeper-search.jsonl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ea6428dcf2e729d84f019c302fb3862a85cefbea08282b5ffcc5c400306ab077
-size 1149933

 version https://git-lfs.github.com/spec/v1
+oid sha256:f14c64de8c22d66b5a1c08af0cb0d829d9a4b671378a2952230b2219d258f0ba
+size 1149833

data/raw_data/kimi-researcher.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:868ac817d88b63a7a253ecf4439b85205ad1c49f2879f4b46f1a9a34d6cf804f
+size 3773315

data/raw_data/openai-deepresearch.jsonl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:77d31b8ea1abd9aa8e924303451dc6a0f334f2e9d4d61ec71847c4db004ac62a
-size 6903938

 version https://git-lfs.github.com/spec/v1
+oid sha256:8a9dbbf7f18d8c985bc4d4f450089eb4bb73e77dbf7168a1bb4c81f811e06d84
+size 6903838

data/raw_data/perplexity-Research.jsonl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7f27cb31cbab84f60efc3286592e84690fd117355dd84f9e4a9299108245c2a5
-size 1747979

 version https://git-lfs.github.com/spec/v1
+oid sha256:0a3b855862c99f108abf97b9e402b43eb4d3376c3ec93c2e0a9c871b70d0736e
+size 1747879

data/raw_data/{perplexity-sonar-pro.jsonl → sonar-pro.jsonl} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5d577c0a208b35eb2c0454c00c70b12759cd8a1687f730f2133d8f392c1831ee
 size 750234

 version https://git-lfs.github.com/spec/v1
+oid sha256:39dbb2c1eae9fe1bc32abaedebe75bbc643ba18ec25e3360726ffb9d514c52ec
 size 750234

data/raw_data/{perplexity-sonar-reasoning-pro.jsonl → sonar-reasoning-pro.jsonl} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e08c6c4094bbf0aa1749e7b1a45e856a6635b2df6afdf0de8eeafea99e7477fc
 size 495156

 version https://git-lfs.github.com/spec/v1
+oid sha256:ffa11b21375455127385b57c69442a2772aa96f0d5b410e274d994eacf920c00
 size 495156

data/raw_data/{perplexity-sonar-reasoning.jsonl → sonar-reasoning.jsonl} RENAMED Viewed

File without changes

data/raw_data/{perplexity-sonar.jsonl → sonar.jsonl} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dc0ef26282e404b700d56e158644f44228c49a3d5126fa12c8068e053444131e
 size 574856

 version https://git-lfs.github.com/spec/v1
+oid sha256:96626a86dca8c67848e8ce9e71e76c4cddec7066ba209e2018d13781ad23f17f
 size 574856

data/raw_results/claude-3-5-sonnet-with-search.jsonl DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c0c47d1bab126886420bd53bb41a8905cdfb97f105711bcc2f5a27e3d53652ea
-size 1992421

data/raw_results/claude-3-5-sonnet-with-search/race_result.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+Comprehensiveness: 0.2128
+Insight: 0.1620
+Instruction Following: 0.3241
+Readability: 0.2987
+Overall Score: 0.2395

data/raw_results/claude-3-5-sonnet-with-search/raw_results.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0423e9b7888c1070e43272284414d91deb490e38d48d0f7d6afc004db6fea291
+size 52686

data/raw_results/claude-3-7-sonnet-with-search.jsonl DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2b3a6bf74400c89d24fa47853ab034ed3696ee0694c2d190ba83c3f5dcd8a0ef
-size 2002379

data/raw_results/claude-3-7-sonnet-with-search/race_result.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+Comprehensiveness: 0.3595
+Insight: 0.3129
+Instruction Following: 0.4405
+Readability: 0.3607
+Overall Score: 0.3663

data/raw_results/claude-3-7-sonnet-with-search/raw_results.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0fde53e238b0f658ccc09d1efb7d6a13e276cd9d67e9e722adc5531dc1561853
+size 52517

data/raw_results/claude-research/race_result.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+Comprehensiveness: 0.4534
+Insight: 0.4279
+Instruction Following: 0.4758
+Readability: 0.4466
+Overall Score: 0.4500

data/raw_results/claude-research/raw_results.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6616b89c3b38eb5f4822cdf82326648652c3fb19b5aca5ada99552cba4f529
+size 52454

data/raw_results/doubao-deepresearch/race_result.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+Comprehensiveness: 0.4484
+Insight: 0.4056
+Instruction Following: 0.4795
+Readability: 0.4469
+Overall Score: 0.4434

data/raw_results/doubao-deepresearch/raw_results.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:732d86bd51bfb687821590ecc227cbce30663d309069e812d6e4bba103ca5e2a
+size 51890

data/raw_results/gemini-2.5-flash-preview-04-17/race_result.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+Comprehensiveness: 0.2897
+Insight: 0.2162
+Instruction Following: 0.3780
+Readability: 0.2997
+Overall Score: 0.2919

data/raw_results/gemini-2.5-flash-preview-04-17/raw_results.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8ee20445df1bc4ca52e8c0de0d8f02d65e808df8f834b789fed749004e2bc3a4
+size 52637

data/raw_results/gemini-2.5-flash-with-grounding.jsonl DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:43b9f71819babb5c00f65f0dd71d707323fb803c585bd74976f49cdc34ab80aa
-size 1951481

data/raw_results/gemini-2.5-pro-deepresearch.jsonl DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ac2fc53c99697e3276c98d735ed630df6fa49d2972c70a5409adc1958ecaa7b7
-size 1937730

data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+Comprehensiveness: 0.4845
+Insight: 0.4830
+Instruction Following: 0.4929
+Readability: 0.4977
+Overall Score: 0.4892