Ayanami0730 committed on
Commit
de24ae3
·
1 Parent(s): d48c1ec

update langchain&nvidia-aiq

Browse files
create_leaderboard.py CHANGED
@@ -66,7 +66,7 @@ with gr.Blocks(title="DeepResearch Bench") as demo:
66
  <a href="https://deepresearch-bench.github.io" target="_blank">Website</a> |
67
  <a href="https://arxiv.org/abs/2506.11763" target="_blank">Paper</a> |
68
  <a href="#" target="_blank">Eval Dataset</a> |
69
- Total models: 19 | Last Update: 15 July 2025<br>
70
  <small style="color: #666; font-size: 0.9em;">
71
  Race judge model: gemini-2.5-pro | Fact-checking models: gemini-2.5-flash
72
  </small>
 
66
  <a href="https://deepresearch-bench.github.io" target="_blank">Website</a> |
67
  <a href="https://arxiv.org/abs/2506.11763" target="_blank">Paper</a> |
68
  <a href="#" target="_blank">Eval Dataset</a> |
69
+ Total models: 21 | Last Update: 02 August 2025<br>
70
  <small style="color: #666; font-size: 0.9em;">
71
  Race judge model: gemini-2.5-pro | Fact-checking models: gemini-2.5-flash
72
  </small>
data/data_viewer.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffd1a662c13226e50fe9e690d43e4575ba524e6e73f77d4195d4f012adcee642
3
- size 42227460
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:252095068a525a2a8fd8be6566831a22e0feab22c41d057b7e7ceedba25ec4dd
3
+ size 47835963
data/leaderboard.csv CHANGED
@@ -1,9 +1,11 @@
1
  model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations
2
- gemini-2.5-pro-deepresearch,48.92,48.45,48.30,49.29,49.77,78.30,165.34
3
  openai-deepresearch,46.45,46.46,43.73,49.39,47.22,75.01,39.79
4
  claude-research,45.00,45.34,42.79,47.58,44.66,-,-
5
  kimi-researcher,44.64,44.96,41.97,47.14,45.59,-,-
6
  doubao-deepresearch,44.34,44.84,40.56,47.95,44.69,52.86,52.62
 
 
7
  perplexity-Research,40.46,39.10,35.65,46.11,43.08,82.63,31.20
8
  grok-deeper-search,38.22,36.08,30.89,46.59,42.17,73.08,8.58
9
  sonar-reasoning-pro,37.76,34.96,31.65,44.93,42.42,45.19,9.39
 
1
  model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations
2
+ gemini-2.5-pro-deepresearch,49.71,49.51,49.45,50.12,50.00,78.30,165.34
3
  openai-deepresearch,46.45,46.46,43.73,49.39,47.22,75.01,39.79
4
  claude-research,45.00,45.34,42.79,47.58,44.66,-,-
5
  kimi-researcher,44.64,44.96,41.97,47.14,45.59,-,-
6
  doubao-deepresearch,44.34,44.84,40.56,47.95,44.69,52.86,52.62
7
+ langchain-open-deep-research,43.44,42.97,39.17,48.09,45.22,-,-
8
+ nvidia-aiq-research-assistant,40.52,37.98,38.39,44.59,42.63,-,-
9
  perplexity-Research,40.46,39.10,35.65,46.11,43.08,82.63,31.20
10
  grok-deeper-search,38.22,36.08,30.89,46.59,42.17,73.08,8.58
11
  sonar-reasoning-pro,37.76,34.96,31.65,44.93,42.42,45.19,9.39
data/raw_data/gemini-2.5-pro-deepresearch.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3705106cc42a38b7c7bcd90e42aaec7f688a0d52179329e3074fcda99ea544e7
3
- size 8523153
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ead8c3cc42c3ea844e71be7bf21670a608feaa5a718695e3b215247d9198a80
3
+ size 8553888
data/raw_data/langchain-open-deep-research.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83bfad2bdf5cb8de9593aaff20214f2588f8fedd8375f289a024c8ed69f2496a
3
+ size 1670658
data/raw_data/nvidia-aiq-research-assistant.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccfba2939cf3724abece976196e7f44fd09f96cf85525bbb7f2eb371a8117f58
3
+ size 3865806
data/raw_results/gemini-2.5-pro-deepresearch/race_result.txt CHANGED
@@ -1,5 +1,5 @@
1
- Comprehensiveness: 0.4845
2
- Insight: 0.4830
3
- Instruction Following: 0.4929
4
- Readability: 0.4977
5
- Overall Score: 0.4892
 
1
+ Comprehensiveness: 0.4951
2
+ Insight: 0.4945
3
+ Instruction Following: 0.5012
4
+ Readability: 0.5000
5
+ Overall Score: 0.4971
data/raw_results/gemini-2.5-pro-deepresearch/raw_results.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34e1afac1851c1e81b65f1f3844aa8da886ef20558d7891ce7145f7c63cc53ca
3
- size 51986
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e27c4da148eb17142ee86e35d66bf84884c72feae1713524962f4a199d3539b
3
+ size 52017
data/raw_results/langchain-open-deep-research/race_result.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Comprehensiveness: 0.4297
2
+ Insight: 0.3917
3
+ Instruction Following: 0.4809
4
+ Readability: 0.4522
5
+ Overall Score: 0.4344
data/raw_results/langchain-open-deep-research/raw_results.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d26af46e0e6829a2f73375c53c0e24f6b2b2f2e3fb7f923a43783135a041bb89
3
+ size 52395
data/raw_results/nvidia-aiq-research-assistant/race_result.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Comprehensiveness: 0.3798
2
+ Insight: 0.3839
3
+ Instruction Following: 0.4459
4
+ Readability: 0.4263
5
+ Overall Score: 0.4052
data/raw_results/nvidia-aiq-research-assistant/raw_results.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0317013a4b4c7832b097562ae485151db73b63e62fde2ce2f30d8b229bcecdd8
3
+ size 52468
tabs/leaderboard_tab.py CHANGED
@@ -32,7 +32,9 @@ MODEL_CATEGORIES = {
32
  "perplexity-Research",
33
  "doubao-deepresearch",
34
  "kimi-researcher",
35
- "claude-research"
 
 
36
  ],
37
  "LLM with Search": [
38
  "claude-3-7-sonnet-with-search",
@@ -50,6 +52,62 @@ MODEL_CATEGORIES = {
50
  ]
51
  }
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def load_leaderboard() -> pd.DataFrame:
54
  if not DATA_PATH.exists():
55
  raise FileNotFoundError(
@@ -65,7 +123,11 @@ def load_leaderboard() -> pd.DataFrame:
65
  return category
66
  return "Others"
67
 
 
 
 
68
  df['category'] = df['model'].apply(get_category)
 
69
  return df
70
 
71
  def make_ranked(df: pd.DataFrame) -> pd.DataFrame:
@@ -84,13 +146,25 @@ def make_ranked(df: pd.DataFrame) -> pd.DataFrame:
84
  lambda x: round(float(x), 2) if x != "-" and pd.notna(x) else x
85
  )
86
 
87
- # 为 Deep Research Agent 添加 HTML 格式(加粗 + 颜色)
88
- ranked['model'] = ranked.apply(
89
- lambda row: f'<span style="color: #823AFF;">{HIGHLIGHT_EMOJI} {row["model"]}</span>'
90
- if row['category'] == CATEGORY_TO_HIGHLIGHT
91
- else row['model'],
92
- axis=1
93
- )
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  return ranked
96
 
@@ -172,6 +246,9 @@ def create_leaderboard_tab():
172
  - **c.acc.**: Citation Accuracy - Correctness of references
173
  - **eff.c.**: Effective Citations - Relevance and quality of sources
174
  - **category**: Model category
 
 
 
175
  """)
176
 
177
  return search_box
 
32
  "perplexity-Research",
33
  "doubao-deepresearch",
34
  "kimi-researcher",
35
+ "claude-research",
36
+ "nvidia-aiq-research-assistant",
37
+ "langchain-open-deep-research"
38
  ],
39
  "LLM with Search": [
40
  "claude-3-7-sonnet-with-search",
 
52
  ]
53
  }
54
 
55
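+ # Model link mapping: Deep Research Agent entries have links; "LLM with Search" entries are left empty for now and can be filled in later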
56
+ MODEL_LINKS = {
57
+ # Deep Research Agent
58
+ "gemini-2.5-pro-deepresearch": "https://gemini.google/overview/deep-research/",
59
+ "grok-deeper-search": "https://x.ai/news/grok-3",
60
+ "openai-deepresearch": "https://openai.com/zh-Hans-CN/index/introducing-deep-research/",
61
+ "perplexity-Research": "https://www.perplexity.ai/hub/blog/introducing-perplexity-deep-research",
62
+ "doubao-deepresearch": "https://www.doubao.com/chat/",
63
+ "kimi-researcher": "https://moonshotai.github.io/Kimi-Researcher/",
64
+ "claude-research": "https://www.anthropic.com/news/research",
65
+ "nvidia-aiq-research-assistant": "https://github.com/NVIDIA-AI-Blueprints/aiq-research-assistant",
66
+ "langchain-open-deep-research": "https://github.com/langchain-ai/open_deep_research",
67
+
68
+ # LLM with Search
69
+ "claude-3-7-sonnet-with-search": "",
70
+ "claude-3-5-sonnet-with-search": "",
71
+ "sonar-reasoning-pro": "",
72
+ "sonar-reasoning": "",
73
+ "sonar-pro": "",
74
+ "sonar": "",
75
+ "gemini-2.5-pro-preview-05-06": "",
76
+ "gpt-4o-search-preview": "",
77
+ "gpt-4.1": "",
78
+ "gemini-2.5-flash-preview-04-17": "",
79
+ "gpt-4o-mini-search-preview": "",
80
+ "gpt-4.1-mini": ""
81
+ }
82
+
83
+ # 模型许可证类型映射
84
+ MODEL_LICENSE_TYPE = {
85
+ # Deep Research Agent
86
+ "gemini-2.5-pro-deepresearch": "Proprietary",
87
+ "grok-deeper-search": "Proprietary",
88
+ "openai-deepresearch": "Proprietary",
89
+ "perplexity-Research": "Proprietary",
90
+ "doubao-deepresearch": "Proprietary",
91
+ "kimi-researcher": "Proprietary",
92
+ "claude-research": "Proprietary",
93
+ "nvidia-aiq-research-assistant": "Apache 2.0",
94
+ "langchain-open-deep-research": "MIT", # 需要确认具体许可证
95
+
96
+ # LLM with Search
97
+ "claude-3-7-sonnet-with-search": "Proprietary",
98
+ "claude-3-5-sonnet-with-search": "Proprietary",
99
+ "sonar-reasoning-pro": "Proprietary",
100
+ "sonar-reasoning": "Proprietary",
101
+ "sonar-pro": "Proprietary",
102
+ "sonar": "Proprietary",
103
+ "gemini-2.5-pro-preview-05-06": "Proprietary",
104
+ "gpt-4o-search-preview": "Proprietary",
105
+ "gpt-4.1": "Proprietary",
106
+ "gemini-2.5-flash-preview-04-17": "Proprietary",
107
+ "gpt-4o-mini-search-preview": "Proprietary",
108
+ "gpt-4.1-mini": "Proprietary"
109
+ }
110
+
111
  def load_leaderboard() -> pd.DataFrame:
112
  if not DATA_PATH.exists():
113
  raise FileNotFoundError(
 
123
  return category
124
  return "Others"
125
 
126
+ def get_license_type(model_name):
127
+ return MODEL_LICENSE_TYPE.get(model_name, "Unknown")
128
+
129
  df['category'] = df['model'].apply(get_category)
130
+ df['license_type'] = df['model'].apply(get_license_type)
131
  return df
132
 
133
  def make_ranked(df: pd.DataFrame) -> pd.DataFrame:
 
146
  lambda x: round(float(x), 2) if x != "-" and pd.notna(x) else x
147
  )
148
 
149
+ # 为模型添加链接和高亮样式
150
+ def format_model_name(row):
151
+ model_name = row['model']
152
+ link = MODEL_LINKS.get(model_name, "")
153
+
154
+ # 根据类别决定是否高亮
155
+ if row['category'] == CATEGORY_TO_HIGHLIGHT:
156
+ display_name = f'<span style="color: #823AFF;">{HIGHLIGHT_EMOJI} {model_name}</span>'
157
+ else:
158
+ display_name = model_name
159
+
160
+ # 如果有链接,包装成<a>标签
161
+ if link and link.strip():
162
+ return f'<a href="{link}" target="_blank" style="text-decoration: none;">{display_name}</a>'
163
+ else:
164
+ # 没有链接时,为将来添加链接做准备(可以添加点击事件等)
165
+ return f'<span class="model-name" data-model="{model_name}">{display_name}</span>'
166
+
167
+ ranked['model'] = ranked.apply(format_model_name, axis=1)
168
 
169
  return ranked
170
 
 
246
  - **c.acc.**: Citation Accuracy - Correctness of references
247
  - **eff.c.**: Effective Citations - Relevance and quality of sources
248
  - **category**: Model category
249
+ - **license_type**: The software license type of the model/service
250
+
251
+ 💡 **Tip**: Model names are clickable when links are available. Visit the GitHub repositories for more details!
252
  """)
253
 
254
  return search_box