DeepResearch-Leaderboard / utils /rank_leaderboard.py
Ayanami0730's picture
Update latest data
1d11ffb
raw
history blame
6.11 kB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import os
import csv
from pathlib import Path
from collections import defaultdict
def parse_race_result(race_result_file):
    """Parse a race_result.txt file into per-dimension scores.

    Relevant lines have the form ``<Label>: <fraction in 0..1>``; fractions
    are rescaled to a 0-100 percentage.

    Args:
        race_result_file: path of the race_result.txt file to read.

    Returns:
        dict mapping 'comprehensiveness', 'insight', 'instruction_following',
        'readability' and 'overall_score' to floats; labels absent from the
        file (or with non-numeric values) are simply missing from the dict.
    """
    # Map file labels to output keys; also acts as the whitelist of lines
    # we attempt to convert to float.
    label_to_key = {
        'Comprehensiveness': 'comprehensiveness',
        'Insight': 'insight',
        'Instruction Following': 'instruction_following',
        'Readability': 'readability',
        'Overall Score': 'overall_score',
    }
    scores = {}
    with open(race_result_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if ':' not in line:
                continue
            label, value = line.split(':', 1)
            label = label.strip()
            if label not in label_to_key:
                continue
            try:
                # Stored as a 0..1 fraction; report as a percentage.
                scores[label_to_key[label]] = float(value.strip()) * 100
            except ValueError:
                # Skip lines whose value is not numeric instead of aborting
                # the whole parse (the original raised here for any ':' line).
                continue
    return scores
def parse_fact_result(fact_result_file):
    """Parse a fact_result.txt file into citation metrics.

    Args:
        fact_result_file: pathlib.Path of the fact_result.txt file; the file
            may be absent, in which case an empty dict is returned.

    Returns:
        dict with up to two keys: 'citation_accuracy' (valid_rate as a
        percentage) and 'effective_citations'.
    """
    citation_scores = {}
    if not fact_result_file.exists():
        return citation_scores
    with open(fact_result_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if ':' not in line:
                continue
            key, value = line.split(':', 1)
            key = key.strip()
            try:
                value = float(value.strip())
            except ValueError:
                # Ignore non-numeric value lines instead of aborting the
                # whole parse (the original raised here for any ':' line).
                continue
            if key == 'valid_rate':
                citation_scores['citation_accuracy'] = value * 100
            elif key in ('total_valid_citations', 'supported_per_task'):
                # NOTE(review): both labels write the same output key, so the
                # last one seen in the file wins — behavior preserved from the
                # original; confirm this precedence is intentional.
                citation_scores['effective_citations'] = value
    return citation_scores
def process_model_data(model_dir):
    """Process one model directory into a leaderboard row.

    Reads ``race_result.txt`` inside *model_dir* and, when present, the
    matching ``data/fact_results/<model>/fact_result.txt`` resolved relative
    to this script's project root.

    Args:
        model_dir: pathlib.Path of the model's raw-results directory; its
            name is taken as the model name.

    Returns:
        dict with the model's scores (citation fields are None when not
        available), or None when the data is missing or unparseable.
    """
    model_name = model_dir.name
    race_result_file = model_dir / "race_result.txt"
    if not race_result_file.exists():
        print(f"警告: 模型 {model_name} 的文件夹中未找到 race_result.txt")
        return None
    print(f"正在处理模型: {model_name}")
    try:
        scores = parse_race_result(race_result_file)
        if not scores:
            print(f" - 警告: 未能解析到有效分数")
            return None
        # Locate the matching fact_result.txt under data/fact_results/<model>/.
        project_root = Path(__file__).parent.parent
        fact_results_dir = project_root / "data" / "fact_results"
        fact_result_file = fact_results_dir / model_name / "fact_result.txt"
        citation_scores = parse_fact_result(fact_result_file)
        if citation_scores:
            # Format the accuracy only when present: the original applied
            # ``:.2f`` to the 'N/A' string fallback, raising ValueError (and
            # dropping the model) whenever only one citation metric parsed.
            accuracy = citation_scores.get('citation_accuracy')
            accuracy_text = f"{accuracy:.2f}" if accuracy is not None else "N/A"
            print(f" - 总分: {scores['overall_score']:.2f}, 引用准确率: {accuracy_text}%, 有效引用数: {citation_scores.get('effective_citations', 'N/A')}")
        else:
            print(f" - 总分: {scores['overall_score']:.2f}, 引用数据: 未找到")
        result = {
            'model': model_name,
            'overall_score': scores['overall_score'],
            'comprehensiveness': scores['comprehensiveness'],
            'insight': scores['insight'],
            'instruction_following': scores['instruction_following'],
            'readability': scores['readability'],
            # Citation metrics may be absent; callers render None as "-".
            'citation_accuracy': citation_scores.get('citation_accuracy', None),
            'effective_citations': citation_scores.get('effective_citations', None)
        }
        return result
    except Exception as e:
        # Any missing score key or I/O failure downgrades to a skipped model.
        print(f" - 错误: 处理文件时出错: {e}")
        return None
def rank_leaderboard():
    """Build the leaderboard CSV from every model's raw results.

    Scans ``data/raw_results/<model>/`` directories, scores each model,
    sorts by overall score (descending) and writes ``data/leaderboard.csv``.
    """
    project_root = Path(__file__).parent.parent
    input_dir = project_root / "data" / "raw_results"
    output_file = project_root / "data" / "leaderboard.csv"
    # Guard the directory itself: iterdir() on a missing path raises
    # FileNotFoundError and would crash the whole run.
    if not input_dir.is_dir():
        print("未找到任何模型文件夹")
        return
    model_dirs = [d for d in input_dir.iterdir() if d.is_dir()]
    print(f"找到 {len(model_dirs)} 个模型文件夹")
    if not model_dirs:
        print("未找到任何模型文件夹")
        return
    model_results = []
    for model_dir in model_dirs:
        try:
            result = process_model_data(model_dir)
            if result:
                model_results.append(result)
        except Exception as e:
            # One bad model directory must not abort the whole leaderboard.
            print(f"处理文件夹 {model_dir.name} 时出错: {e}")
            continue
    # Best-first ordering by overall score.
    model_results.sort(key=lambda x: x['overall_score'], reverse=True)
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['model', 'overall_score', 'comprehensiveness', 'insight', 'instruction_following', 'readability', 'citation_accuracy', 'effective_citations']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in model_results:
            # Format numbers to two decimals; missing citation data -> "-".
            row = {
                'model': result['model'],
                'overall_score': f"{result['overall_score']:.2f}",
                'comprehensiveness': f"{result['comprehensiveness']:.2f}",
                'insight': f"{result['insight']:.2f}",
                'instruction_following': f"{result['instruction_following']:.2f}",
                'readability': f"{result['readability']:.2f}",
                'citation_accuracy': f"{result['citation_accuracy']:.2f}" if result['citation_accuracy'] is not None else "-",
                'effective_citations': f"{result['effective_citations']:.2f}" if result['effective_citations'] is not None else "-"
            }
            writer.writerow(row)
    print(f"\n排行榜已保存到: {output_file}")
    print(f"共处理了 {len(model_results)} 个模型")
# Script entry point: build the leaderboard CSV, then report completion.
if __name__ == "__main__":
    rank_leaderboard()
    print("排行榜计算完成!")