Ayanami0730 committed on
Commit
1d11ffb
·
1 Parent(s): 2f64f8a

Update latest data

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. create_leaderboard.py +4 -1
  2. data/data_viewer.jsonl +2 -2
  3. data/fact_results/claude-3-5-sonnet-with-search/fact_result.txt +3 -0
  4. data/fact_results/claude-3-7-sonnet-with-search/fact_result.txt +3 -0
  5. data/fact_results/doubao-deepresearch/fact_result.txt +3 -0
  6. data/fact_results/gemini-2.5-flash-preview-04-17/fact_result.txt +1 -0
  7. data/fact_results/gemini-2.5-pro-deepresearch/fact_result.txt +3 -0
  8. data/fact_results/gemini-2.5-pro-preview-05-06/fact_result.txt +1 -0
  9. data/fact_results/gpt-4.1-mini/fact_result.txt +3 -0
  10. data/fact_results/gpt-4.1/fact_result.txt +3 -0
  11. data/fact_results/gpt-4o-mini-search-preview/fact_result.txt +3 -0
  12. data/fact_results/gpt-4o-search-preview/fact_result.txt +3 -0
  13. data/fact_results/grok-deeper-search/fact_result.txt +3 -0
  14. data/fact_results/openai-deepresearch/fact_result.txt +3 -0
  15. data/fact_results/perplexity-Research/fact_result.txt +3 -0
  16. data/fact_results/sonar-pro/fact_result.txt +3 -0
  17. data/fact_results/sonar-reasoning-pro/fact_result.txt +3 -0
  18. data/fact_results/sonar-reasoning/fact_result.txt +3 -0
  19. data/fact_results/sonar/fact_result.txt +3 -0
  20. data/leaderboard.csv +19 -16
  21. data/raw_data/claude-research.jsonl +3 -0
  22. data/raw_data/doubao-deepresearch.jsonl +3 -0
  23. data/raw_data/{gemini-2.5-flash-with-grounding.jsonl → gemini-2.5-flash-preview-04-17.jsonl} +0 -0
  24. data/raw_data/gemini-2.5-pro-deepresearch.jsonl +2 -2
  25. data/raw_data/{gemini-2.5-pro-with-grounding.jsonl → gemini-2.5-pro-preview-05-06.jsonl} +0 -0
  26. data/raw_data/{gpt-4.1-mini-with-search.jsonl → gpt-4.1-mini.jsonl} +0 -0
  27. data/raw_data/{gpt-4.1-with-search.jsonl → gpt-4.1.jsonl} +2 -2
  28. data/raw_data/grok-deeper-search.jsonl +2 -2
  29. data/raw_data/kimi-researcher.jsonl +3 -0
  30. data/raw_data/openai-deepresearch.jsonl +2 -2
  31. data/raw_data/perplexity-Research.jsonl +2 -2
  32. data/raw_data/{perplexity-sonar-pro.jsonl → sonar-pro.jsonl} +1 -1
  33. data/raw_data/{perplexity-sonar-reasoning-pro.jsonl → sonar-reasoning-pro.jsonl} +1 -1
  34. data/raw_data/{perplexity-sonar-reasoning.jsonl → sonar-reasoning.jsonl} +0 -0
  35. data/raw_data/{perplexity-sonar.jsonl → sonar.jsonl} +1 -1
  36. data/raw_results/claude-3-5-sonnet-with-search.jsonl +0 -3
  37. data/raw_results/claude-3-5-sonnet-with-search/race_result.txt +5 -0
  38. data/raw_results/claude-3-5-sonnet-with-search/raw_results.jsonl +3 -0
  39. data/raw_results/claude-3-7-sonnet-with-search.jsonl +0 -3
  40. data/raw_results/claude-3-7-sonnet-with-search/race_result.txt +5 -0
  41. data/raw_results/claude-3-7-sonnet-with-search/raw_results.jsonl +3 -0
  42. data/raw_results/claude-research/race_result.txt +5 -0
  43. data/raw_results/claude-research/raw_results.jsonl +3 -0
  44. data/raw_results/doubao-deepresearch/race_result.txt +5 -0
  45. data/raw_results/doubao-deepresearch/raw_results.jsonl +3 -0
  46. data/raw_results/gemini-2.5-flash-preview-04-17/race_result.txt +5 -0
  47. data/raw_results/gemini-2.5-flash-preview-04-17/raw_results.jsonl +3 -0
  48. data/raw_results/gemini-2.5-flash-with-grounding.jsonl +0 -3
  49. data/raw_results/gemini-2.5-pro-deepresearch.jsonl +0 -3
  50. data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt +5 -0
create_leaderboard.py CHANGED
@@ -66,7 +66,10 @@ with gr.Blocks(title="DeepResearch Bench") as demo:
66
  <a href="https://deepresearch-bench.github.io" target="_blank">Website</a> |
67
  <a href="https://arxiv.org/abs/2506.11763" target="_blank">Paper</a> |
68
  <a href="#" target="_blank">Eval Dataset</a> |
69
- Total models: 16 | Last Update: 28 May 2025
 
 
 
70
  </div>
71
  """)
72
 
 
66
  <a href="https://deepresearch-bench.github.io" target="_blank">Website</a> |
67
  <a href="https://arxiv.org/abs/2506.11763" target="_blank">Paper</a> |
68
  <a href="#" target="_blank">Eval Dataset</a> |
69
+ Total models: 19 | Last Update: 29 Dec 2024<br>
70
+ <small style="color: #666; font-size: 0.9em;">
71
+ Race judge model: gemini-2.5-pro | Fact-checking models: gemini-2.5-flash
72
+ </small>
73
  </div>
74
  """)
75
 
data/data_viewer.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7ab11f250f4ffd6bf9c74ff8dc1e68f86d7abbf4f6319164bb476177ad7bf6e
3
- size 28044256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:647067a9eec626525fa41f257123b5b35f9daf6e9862467e9dc259f987ce621f
3
+ size 40834049
data/fact_results/claude-3-5-sonnet-with-search/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 9.94
2
+ total_valid_citations: 9.35
3
+ valid_rate: 0.9406438631790744
data/fact_results/claude-3-7-sonnet-with-search/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 28.07
2
+ total_valid_citations: 24.51
3
+ valid_rate: 0.8731742073387959
data/fact_results/doubao-deepresearch/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ citations_per_task: 99.5510
2
+ supported_per_task: 52.6224
3
+ valid_rate: 0.5286
data/fact_results/gemini-2.5-flash-preview-04-17/fact_result.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ No tasks with valid results.
data/fact_results/gemini-2.5-pro-deepresearch/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ citations_per_task: 211.1616
2
+ supported_per_task: 165.3434
3
+ valid_rate: 0.7830
data/fact_results/gemini-2.5-pro-preview-05-06/fact_result.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ No tasks with valid results.
data/fact_results/gpt-4.1-mini/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 4.85
2
+ total_valid_citations: 4.1
3
+ valid_rate: 0.845360824742268
data/fact_results/gpt-4.1/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 4.746987951807229
2
+ total_valid_citations: 4.265060240963855
3
+ valid_rate: 0.8984771573604061
data/fact_results/gpt-4o-mini-search-preview/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 5.651162790697675
2
+ total_valid_citations: 4.616279069767442
3
+ valid_rate: 0.8168724279835391
data/fact_results/gpt-4o-search-preview/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 5.825581395348837
2
+ total_valid_citations: 5.046511627906977
3
+ valid_rate: 0.8662674650698603
data/fact_results/grok-deeper-search/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 11.741935483870968
2
+ total_valid_citations: 8.580645161290322
3
+ valid_rate: 0.7307692307692307
data/fact_results/openai-deepresearch/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 53.04040404040404
2
+ total_valid_citations: 39.78787878787879
3
+ valid_rate: 0.7501428299371549
data/fact_results/perplexity-Research/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 37.76
2
+ total_valid_citations: 31.2
3
+ valid_rate: 0.826271186440678
data/fact_results/sonar-pro/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 21.01
2
+ total_valid_citations: 16.75
3
+ valid_rate: 0.7972394098048549
data/fact_results/sonar-reasoning-pro/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 20.78
2
+ total_valid_citations: 9.39
3
+ valid_rate: 0.45187680461982677
data/fact_results/sonar-reasoning/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 25.43
2
+ total_valid_citations: 13.37
3
+ valid_rate: 0.525756979944947
data/fact_results/sonar/fact_result.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ total_citations: 13.97872340425532
2
+ total_valid_citations: 10.680851063829786
3
+ valid_rate: 0.7640791476407914
data/leaderboard.csv CHANGED
@@ -1,17 +1,20 @@
1
  model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations
2
- gemini-2.5-pro-deepresearch,48.88,48.53,48.50,49.18,49.44,81.44,111.21
3
- openai-deepresearch,46.98,46.87,45.25,49.27,47.14,77.96,40.79
4
- perplexity-Research,42.25,40.69,39.39,46.40,44.28,90.24,31.26
5
- claude-3-7-sonnet-with-search,40.67,38.99,37.66,45.77,41.46,93.68,32.48
6
- grok-deeper-search,40.24,37.97,35.37,46.30,44.05,83.59,8.15
7
- perplexity-sonar-reasoning-pro,40.22,37.38,36.11,45.66,44.74,39.36,8.35
8
- perplexity-sonar-reasoning,40.18,37.14,36.73,45.15,44.35,48.67,11.34
9
- perplexity-sonar-pro,38.93,36.38,34.26,44.70,43.35,78.66,14.74
10
- gemini-2.5-pro-with-grounding,35.12,34.06,29.79,41.67,37.16,81.81,32.88
11
- gpt-4o-search-preview,35.10,31.99,27.57,43.17,41.23,88.41,4.79
12
- perplexity-sonar,34.54,30.95,27.51,42.33,41.60,74.42,8.67
13
- gpt-4.1-with-search,33.46,29.42,25.38,42.33,40.77,87.83,4.42
14
- gemini-2.5-flash-preview-04-17,32.39,31.63,26.73,38.82,34.48,81.92,31.08
15
- gpt-4o-mini-search-preview,31.55,27.38,22.64,40.67,39.91,84.98,4.95
16
- gpt-4.1-mini-with-search,30.26,26.05,20.75,39.65,39.33,84.58,4.35
17
- claude-3-5-sonnet-with-search,28.48,24.82,22.82,35.12,35.08,94.04,9.78
 
 
 
 
1
  model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations
2
+ gemini-2.5-pro-deepresearch,48.92,48.45,48.30,49.29,49.77,78.30,165.34
3
+ openai-deepresearch,46.45,46.46,43.73,49.39,47.22,75.01,39.79
4
+ claude-research,45.00,45.34,42.79,47.58,44.66,-,-
5
+ doubao-deepresearch,44.34,44.84,40.56,47.95,44.69,52.86,52.62
6
+ kimi-researcher,42.69,42.82,39.40,45.30,44.68,-,-
7
+ perplexity-Research,40.46,39.10,35.65,46.11,43.08,82.63,31.20
8
+ grok-deeper-search,38.22,36.08,30.89,46.59,42.17,73.08,8.58
9
+ sonar-reasoning-pro,37.76,34.96,31.65,44.93,42.42,45.19,9.39
10
+ sonar-reasoning,37.75,34.73,32.59,44.42,42.39,52.58,13.37
11
+ claude-3-7-sonnet-with-search,36.63,35.95,31.29,44.05,36.07,87.32,24.51
12
+ sonar-pro,36.19,33.92,29.69,43.39,41.07,79.72,16.75
13
+ gemini-2.5-pro-preview-05-06,31.90,31.75,24.61,40.24,32.76,-,-
14
+ gpt-4o-search-preview,30.74,27.81,20.44,41.01,37.60,86.63,5.05
15
+ sonar,30.64,27.14,21.62,40.70,37.46,76.41,10.68
16
+ gpt-4.1,29.31,25.59,18.42,40.63,36.49,89.85,4.27
17
+ gemini-2.5-flash-preview-04-17,29.19,28.97,21.62,37.80,29.97,-,-
18
+ gpt-4o-mini-search-preview,27.62,24.24,16.62,38.59,35.27,81.69,4.62
19
+ gpt-4.1-mini,26.62,22.86,15.39,38.18,34.49,84.54,4.10
20
+ claude-3-5-sonnet-with-search,23.95,21.28,16.20,32.41,29.87,94.06,9.35
data/raw_data/claude-research.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:300f8dbc8242a5852bbe44098403f35fac1e4136e2274c93f0a3d659fee00d7f
3
+ size 1513379
data/raw_data/doubao-deepresearch.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b9512cedf730486da486ba9d0ec305213ca8176bd00b93da788367f090717f2
3
+ size 7451876
data/raw_data/{gemini-2.5-flash-with-grounding.jsonl → gemini-2.5-flash-preview-04-17.jsonl} RENAMED
File without changes
data/raw_data/gemini-2.5-pro-deepresearch.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33c5d28e76595f22fae1b0fbbe2700958bfe707dafe53f7c5842d3067ccfddef
3
- size 8523353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3705106cc42a38b7c7bcd90e42aaec7f688a0d52179329e3074fcda99ea544e7
3
+ size 8523153
data/raw_data/{gemini-2.5-pro-with-grounding.jsonl → gemini-2.5-pro-preview-05-06.jsonl} RENAMED
File without changes
data/raw_data/{gpt-4.1-mini-with-search.jsonl → gpt-4.1-mini.jsonl} RENAMED
File without changes
data/raw_data/{gpt-4.1-with-search.jsonl → gpt-4.1.jsonl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0228624b09e9d6c25c72156f4dd7f5702e3adcdd71a1f309094c2913eb50639
3
- size 492406
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7611ff4a5cd8824aa711c9759d7dff02a990a57fbb36d699442484431bd5662
3
+ size 492409
data/raw_data/grok-deeper-search.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea6428dcf2e729d84f019c302fb3862a85cefbea08282b5ffcc5c400306ab077
3
- size 1149933
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f14c64de8c22d66b5a1c08af0cb0d829d9a4b671378a2952230b2219d258f0ba
3
+ size 1149833
data/raw_data/kimi-researcher.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:868ac817d88b63a7a253ecf4439b85205ad1c49f2879f4b46f1a9a34d6cf804f
3
+ size 3773315
data/raw_data/openai-deepresearch.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:77d31b8ea1abd9aa8e924303451dc6a0f334f2e9d4d61ec71847c4db004ac62a
3
- size 6903938
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a9dbbf7f18d8c985bc4d4f450089eb4bb73e77dbf7168a1bb4c81f811e06d84
3
+ size 6903838
data/raw_data/perplexity-Research.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f27cb31cbab84f60efc3286592e84690fd117355dd84f9e4a9299108245c2a5
3
- size 1747979
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a3b855862c99f108abf97b9e402b43eb4d3376c3ec93c2e0a9c871b70d0736e
3
+ size 1747879
data/raw_data/{perplexity-sonar-pro.jsonl → sonar-pro.jsonl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d577c0a208b35eb2c0454c00c70b12759cd8a1687f730f2133d8f392c1831ee
3
  size 750234
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39dbb2c1eae9fe1bc32abaedebe75bbc643ba18ec25e3360726ffb9d514c52ec
3
  size 750234
data/raw_data/{perplexity-sonar-reasoning-pro.jsonl → sonar-reasoning-pro.jsonl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e08c6c4094bbf0aa1749e7b1a45e856a6635b2df6afdf0de8eeafea99e7477fc
3
  size 495156
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffa11b21375455127385b57c69442a2772aa96f0d5b410e274d994eacf920c00
3
  size 495156
data/raw_data/{perplexity-sonar-reasoning.jsonl → sonar-reasoning.jsonl} RENAMED
File without changes
data/raw_data/{perplexity-sonar.jsonl → sonar.jsonl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc0ef26282e404b700d56e158644f44228c49a3d5126fa12c8068e053444131e
3
  size 574856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96626a86dca8c67848e8ce9e71e76c4cddec7066ba209e2018d13781ad23f17f
3
  size 574856
data/raw_results/claude-3-5-sonnet-with-search.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0c47d1bab126886420bd53bb41a8905cdfb97f105711bcc2f5a27e3d53652ea
3
- size 1992421
 
 
 
 
data/raw_results/claude-3-5-sonnet-with-search/race_result.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Comprehensiveness: 0.2128
2
+ Insight: 0.1620
3
+ Instruction Following: 0.3241
4
+ Readability: 0.2987
5
+ Overall Score: 0.2395
data/raw_results/claude-3-5-sonnet-with-search/raw_results.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0423e9b7888c1070e43272284414d91deb490e38d48d0f7d6afc004db6fea291
3
+ size 52686
data/raw_results/claude-3-7-sonnet-with-search.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b3a6bf74400c89d24fa47853ab034ed3696ee0694c2d190ba83c3f5dcd8a0ef
3
- size 2002379
 
 
 
 
data/raw_results/claude-3-7-sonnet-with-search/race_result.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Comprehensiveness: 0.3595
2
+ Insight: 0.3129
3
+ Instruction Following: 0.4405
4
+ Readability: 0.3607
5
+ Overall Score: 0.3663
data/raw_results/claude-3-7-sonnet-with-search/raw_results.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fde53e238b0f658ccc09d1efb7d6a13e276cd9d67e9e722adc5531dc1561853
3
+ size 52517
data/raw_results/claude-research/race_result.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Comprehensiveness: 0.4534
2
+ Insight: 0.4279
3
+ Instruction Following: 0.4758
4
+ Readability: 0.4466
5
+ Overall Score: 0.4500
data/raw_results/claude-research/raw_results.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb6616b89c3b38eb5f4822cdf82326648652c3fb19b5aca5ada99552cba4f529
3
+ size 52454
data/raw_results/doubao-deepresearch/race_result.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Comprehensiveness: 0.4484
2
+ Insight: 0.4056
3
+ Instruction Following: 0.4795
4
+ Readability: 0.4469
5
+ Overall Score: 0.4434
data/raw_results/doubao-deepresearch/raw_results.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:732d86bd51bfb687821590ecc227cbce30663d309069e812d6e4bba103ca5e2a
3
+ size 51890
data/raw_results/gemini-2.5-flash-preview-04-17/race_result.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Comprehensiveness: 0.2897
2
+ Insight: 0.2162
3
+ Instruction Following: 0.3780
4
+ Readability: 0.2997
5
+ Overall Score: 0.2919
data/raw_results/gemini-2.5-flash-preview-04-17/raw_results.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ee20445df1bc4ca52e8c0de0d8f02d65e808df8f834b789fed749004e2bc3a4
3
+ size 52637
data/raw_results/gemini-2.5-flash-with-grounding.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:43b9f71819babb5c00f65f0dd71d707323fb803c585bd74976f49cdc34ab80aa
3
- size 1951481
 
 
 
 
data/raw_results/gemini-2.5-pro-deepresearch.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac2fc53c99697e3276c98d735ed630df6fa49d2972c70a5409adc1958ecaa7b7
3
- size 1937730
 
 
 
 
data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Comprehensiveness: 0.4845
2
+ Insight: 0.4830
3
+ Instruction Following: 0.4929
4
+ Readability: 0.4977
5
+ Overall Score: 0.4892