Commit
·
de24ae3
1
Parent(s):
d48c1ec
update langchain&nvidia-aiq
Browse files- create_leaderboard.py +1 -1
- data/data_viewer.jsonl +2 -2
- data/leaderboard.csv +3 -1
- data/raw_data/gemini-2.5-pro-deepresearch.jsonl +2 -2
- data/raw_data/langchain-open-deep-research.jsonl +3 -0
- data/raw_data/nvidia-aiq-research-assistant.jsonl +3 -0
- data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt +5 -5
- data/raw_results/gemini-2.5-pro-deepresearch/raw_results.jsonl +2 -2
- data/raw_results/langchain-open-deep-research/race_result.txt +5 -0
- data/raw_results/langchain-open-deep-research/raw_results.jsonl +3 -0
- data/raw_results/nvidia-aiq-research-assistant/race_result.txt +5 -0
- data/raw_results/nvidia-aiq-research-assistant/raw_results.jsonl +3 -0
- tabs/leaderboard_tab.py +85 -8
create_leaderboard.py
CHANGED
@@ -66,7 +66,7 @@ with gr.Blocks(title="DeepResearch Bench") as demo:
|
|
66 |
<a href="https://deepresearch-bench.github.io" target="_blank">Website</a> |
|
67 |
<a href="https://arxiv.org/abs/2506.11763" target="_blank">Paper</a> |
|
68 |
<a href="#" target="_blank">Eval Dataset</a> |
|
69 |
-
Total models:
|
70 |
<small style="color: #666; font-size: 0.9em;">
|
71 |
Race judge model: gemini-2.5-pro | Fact-checking models: gemini-2.5-flash
|
72 |
</small>
|
|
|
66 |
<a href="https://deepresearch-bench.github.io" target="_blank">Website</a> |
|
67 |
<a href="https://arxiv.org/abs/2506.11763" target="_blank">Paper</a> |
|
68 |
<a href="#" target="_blank">Eval Dataset</a> |
|
69 |
+
Total models: 21 | Last Update: 02 August 2025<br>
|
70 |
<small style="color: #666; font-size: 0.9em;">
|
71 |
Race judge model: gemini-2.5-pro | Fact-checking models: gemini-2.5-flash
|
72 |
</small>
|
data/data_viewer.jsonl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:252095068a525a2a8fd8be6566831a22e0feab22c41d057b7e7ceedba25ec4dd
|
3 |
+
size 47835963
|
data/leaderboard.csv
CHANGED
@@ -1,9 +1,11 @@
|
|
1 |
model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations
|
2 |
-
gemini-2.5-pro-deepresearch,
|
3 |
openai-deepresearch,46.45,46.46,43.73,49.39,47.22,75.01,39.79
|
4 |
claude-research,45.00,45.34,42.79,47.58,44.66,-,-
|
5 |
kimi-researcher,44.64,44.96,41.97,47.14,45.59,-,-
|
6 |
doubao-deepresearch,44.34,44.84,40.56,47.95,44.69,52.86,52.62
|
|
|
|
|
7 |
perplexity-Research,40.46,39.10,35.65,46.11,43.08,82.63,31.20
|
8 |
grok-deeper-search,38.22,36.08,30.89,46.59,42.17,73.08,8.58
|
9 |
sonar-reasoning-pro,37.76,34.96,31.65,44.93,42.42,45.19,9.39
|
|
|
1 |
model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations
|
2 |
+
gemini-2.5-pro-deepresearch,49.71,49.51,49.45,50.12,50.00,78.30,165.34
|
3 |
openai-deepresearch,46.45,46.46,43.73,49.39,47.22,75.01,39.79
|
4 |
claude-research,45.00,45.34,42.79,47.58,44.66,-,-
|
5 |
kimi-researcher,44.64,44.96,41.97,47.14,45.59,-,-
|
6 |
doubao-deepresearch,44.34,44.84,40.56,47.95,44.69,52.86,52.62
|
7 |
+
langchain-open-deep-research,43.44,42.97,39.17,48.09,45.22,-,-
|
8 |
+
nvidia-aiq-research-assistant,40.52,37.98,38.39,44.59,42.63,-,-
|
9 |
perplexity-Research,40.46,39.10,35.65,46.11,43.08,82.63,31.20
|
10 |
grok-deeper-search,38.22,36.08,30.89,46.59,42.17,73.08,8.58
|
11 |
sonar-reasoning-pro,37.76,34.96,31.65,44.93,42.42,45.19,9.39
|
data/raw_data/gemini-2.5-pro-deepresearch.jsonl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0ead8c3cc42c3ea844e71be7bf21670a608feaa5a718695e3b215247d9198a80
|
3 |
+
size 8553888
|
data/raw_data/langchain-open-deep-research.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:83bfad2bdf5cb8de9593aaff20214f2588f8fedd8375f289a024c8ed69f2496a
|
3 |
+
size 1670658
|
data/raw_data/nvidia-aiq-research-assistant.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ccfba2939cf3724abece976196e7f44fd09f96cf85525bbb7f2eb371a8117f58
|
3 |
+
size 3865806
|
data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
Comprehensiveness: 0.
|
2 |
-
Insight: 0.
|
3 |
-
Instruction Following: 0.
|
4 |
-
Readability: 0.
|
5 |
-
Overall Score: 0.
|
|
|
1 |
+
Comprehensiveness: 0.4951
|
2 |
+
Insight: 0.4945
|
3 |
+
Instruction Following: 0.5012
|
4 |
+
Readability: 0.5000
|
5 |
+
Overall Score: 0.4971
|
data/raw_results/gemini-2.5-pro-deepresearch/raw_results.jsonl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6e27c4da148eb17142ee86e35d66bf84884c72feae1713524962f4a199d3539b
|
3 |
+
size 52017
|
data/raw_results/langchain-open-deep-research/race_result.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Comprehensiveness: 0.4297
|
2 |
+
Insight: 0.3917
|
3 |
+
Instruction Following: 0.4809
|
4 |
+
Readability: 0.4522
|
5 |
+
Overall Score: 0.4344
|
data/raw_results/langchain-open-deep-research/raw_results.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d26af46e0e6829a2f73375c53c0e24f6b2b2f2e3fb7f923a43783135a041bb89
|
3 |
+
size 52395
|
data/raw_results/nvidia-aiq-research-assistant/race_result.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Comprehensiveness: 0.3798
|
2 |
+
Insight: 0.3839
|
3 |
+
Instruction Following: 0.4459
|
4 |
+
Readability: 0.4263
|
5 |
+
Overall Score: 0.4052
|
data/raw_results/nvidia-aiq-research-assistant/raw_results.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0317013a4b4c7832b097562ae485151db73b63e62fde2ce2f30d8b229bcecdd8
|
3 |
+
size 52468
|
tabs/leaderboard_tab.py
CHANGED
@@ -32,7 +32,9 @@ MODEL_CATEGORIES = {
|
|
32 |
"perplexity-Research",
|
33 |
"doubao-deepresearch",
|
34 |
"kimi-researcher",
|
35 |
-
"claude-research"
|
|
|
|
|
36 |
],
|
37 |
"LLM with Search": [
|
38 |
"claude-3-7-sonnet-with-search",
|
@@ -50,6 +52,62 @@ MODEL_CATEGORIES = {
|
|
50 |
]
|
51 |
}
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
def load_leaderboard() -> pd.DataFrame:
|
54 |
if not DATA_PATH.exists():
|
55 |
raise FileNotFoundError(
|
@@ -65,7 +123,11 @@ def load_leaderboard() -> pd.DataFrame:
|
|
65 |
return category
|
66 |
return "Others"
|
67 |
|
|
|
|
|
|
|
68 |
df['category'] = df['model'].apply(get_category)
|
|
|
69 |
return df
|
70 |
|
71 |
def make_ranked(df: pd.DataFrame) -> pd.DataFrame:
|
@@ -84,13 +146,25 @@ def make_ranked(df: pd.DataFrame) -> pd.DataFrame:
|
|
84 |
lambda x: round(float(x), 2) if x != "-" and pd.notna(x) else x
|
85 |
)
|
86 |
|
87 |
-
#
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
return ranked
|
96 |
|
@@ -172,6 +246,9 @@ def create_leaderboard_tab():
|
|
172 |
- **c.acc.**: Citation Accuracy - Correctness of references
|
173 |
- **eff.c.**: Effective Citations - Relevance and quality of sources
|
174 |
- **category**: Model category
|
|
|
|
|
|
|
175 |
""")
|
176 |
|
177 |
return search_box
|
|
|
32 |
"perplexity-Research",
|
33 |
"doubao-deepresearch",
|
34 |
"kimi-researcher",
|
35 |
+
"claude-research",
|
36 |
+
"nvidia-aiq-research-assistant",
|
37 |
+
"langchain-open-deep-research"
|
38 |
],
|
39 |
"LLM with Search": [
|
40 |
"claude-3-7-sonnet-with-search",
|
|
|
52 |
]
|
53 |
}
|
54 |
|
55 |
+
# Mapping from model name to its homepage/repository URL (an empty string means no link is available yet; fill these in later)
|
56 |
+
MODEL_LINKS = {
|
57 |
+
# Deep Research Agent
|
58 |
+
"gemini-2.5-pro-deepresearch": "https://gemini.google/overview/deep-research/",
|
59 |
+
"grok-deeper-search": "https://x.ai/news/grok-3",
|
60 |
+
"openai-deepresearch": "https://openai.com/zh-Hans-CN/index/introducing-deep-research/",
|
61 |
+
"perplexity-Research": "https://www.perplexity.ai/hub/blog/introducing-perplexity-deep-research",
|
62 |
+
"doubao-deepresearch": "https://www.doubao.com/chat/",
|
63 |
+
"kimi-researcher": "https://moonshotai.github.io/Kimi-Researcher/",
|
64 |
+
"claude-research": "https://www.anthropic.com/news/research",
|
65 |
+
"nvidia-aiq-research-assistant": "https://github.com/NVIDIA-AI-Blueprints/aiq-research-assistant",
|
66 |
+
"langchain-open-deep-research": "https://github.com/langchain-ai/open_deep_research",
|
67 |
+
|
68 |
+
# LLM with Search
|
69 |
+
"claude-3-7-sonnet-with-search": "",
|
70 |
+
"claude-3-5-sonnet-with-search": "",
|
71 |
+
"sonar-reasoning-pro": "",
|
72 |
+
"sonar-reasoning": "",
|
73 |
+
"sonar-pro": "",
|
74 |
+
"sonar": "",
|
75 |
+
"gemini-2.5-pro-preview-05-06": "",
|
76 |
+
"gpt-4o-search-preview": "",
|
77 |
+
"gpt-4.1": "",
|
78 |
+
"gemini-2.5-flash-preview-04-17": "",
|
79 |
+
"gpt-4o-mini-search-preview": "",
|
80 |
+
"gpt-4.1-mini": ""
|
81 |
+
}
|
82 |
+
|
83 |
+
# Mapping from model name to its license type
|
84 |
+
MODEL_LICENSE_TYPE = {
|
85 |
+
# Deep Research Agent
|
86 |
+
"gemini-2.5-pro-deepresearch": "Proprietary",
|
87 |
+
"grok-deeper-search": "Proprietary",
|
88 |
+
"openai-deepresearch": "Proprietary",
|
89 |
+
"perplexity-Research": "Proprietary",
|
90 |
+
"doubao-deepresearch": "Proprietary",
|
91 |
+
"kimi-researcher": "Proprietary",
|
92 |
+
"claude-research": "Proprietary",
|
93 |
+
"nvidia-aiq-research-assistant": "Apache 2.0",
|
94 |
+
"langchain-open-deep-research": "MIT", # TODO: confirm the exact license
|
95 |
+
|
96 |
+
# LLM with Search
|
97 |
+
"claude-3-7-sonnet-with-search": "Proprietary",
|
98 |
+
"claude-3-5-sonnet-with-search": "Proprietary",
|
99 |
+
"sonar-reasoning-pro": "Proprietary",
|
100 |
+
"sonar-reasoning": "Proprietary",
|
101 |
+
"sonar-pro": "Proprietary",
|
102 |
+
"sonar": "Proprietary",
|
103 |
+
"gemini-2.5-pro-preview-05-06": "Proprietary",
|
104 |
+
"gpt-4o-search-preview": "Proprietary",
|
105 |
+
"gpt-4.1": "Proprietary",
|
106 |
+
"gemini-2.5-flash-preview-04-17": "Proprietary",
|
107 |
+
"gpt-4o-mini-search-preview": "Proprietary",
|
108 |
+
"gpt-4.1-mini": "Proprietary"
|
109 |
+
}
|
110 |
+
|
111 |
def load_leaderboard() -> pd.DataFrame:
|
112 |
if not DATA_PATH.exists():
|
113 |
raise FileNotFoundError(
|
|
|
123 |
return category
|
124 |
return "Others"
|
125 |
|
126 |
+
def get_license_type(model_name):
|
127 |
+
return MODEL_LICENSE_TYPE.get(model_name, "Unknown")
|
128 |
+
|
129 |
df['category'] = df['model'].apply(get_category)
|
130 |
+
df['license_type'] = df['model'].apply(get_license_type)
|
131 |
return df
|
132 |
|
133 |
def make_ranked(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
146 |
lambda x: round(float(x), 2) if x != "-" and pd.notna(x) else x
|
147 |
)
|
148 |
|
149 |
+
# Add a hyperlink and highlight styling to each model name
|
150 |
+
def format_model_name(row):
|
151 |
+
model_name = row['model']
|
152 |
+
link = MODEL_LINKS.get(model_name, "")
|
153 |
+
|
154 |
+
# Decide whether to highlight the name based on the model's category
|
155 |
+
if row['category'] == CATEGORY_TO_HIGHLIGHT:
|
156 |
+
display_name = f'<span style="color: #823AFF;">{HIGHLIGHT_EMOJI} {model_name}</span>'
|
157 |
+
else:
|
158 |
+
display_name = model_name
|
159 |
+
|
160 |
+
# If a link is available, wrap the display name in an <a> tag
|
161 |
+
if link and link.strip():
|
162 |
+
return f'<a href="{link}" target="_blank" style="text-decoration: none;">{display_name}</a>'
|
163 |
+
else:
|
164 |
+
# No link available: emit a span tagged with the model name so a link (or a click handler, etc.) can be attached later
|
165 |
+
return f'<span class="model-name" data-model="{model_name}">{display_name}</span>'
|
166 |
+
|
167 |
+
ranked['model'] = ranked.apply(format_model_name, axis=1)
|
168 |
|
169 |
return ranked
|
170 |
|
|
|
246 |
- **c.acc.**: Citation Accuracy - Correctness of references
|
247 |
- **eff.c.**: Effective Citations - Relevance and quality of sources
|
248 |
- **category**: Model category
|
249 |
+
- **license_type**: The software license type of the model/service
|
250 |
+
|
251 |
+
💡 **Tip**: Model names are clickable when links are available. Visit the GitHub repositories for more details!
|
252 |
""")
|
253 |
|
254 |
return search_box
|