DeepResearch-Leaderboard / utils /rank_leaderboard.py
Ayanami0730's picture
Update latest data
1d11ffb
raw
history blame
6.11 kB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import os
import csv
from pathlib import Path
from collections import defaultdict
def parse_race_result(race_result_file):
    """Parse a race_result.txt file into per-dimension scores.

    Relevant lines have the form ``<Label>: <fraction in 0..1>``; fractions
    are rescaled to a 0-100 percentage.

    Args:
        race_result_file: path of the race_result.txt file to read.

    Returns:
        dict mapping 'comprehensiveness', 'insight', 'instruction_following',
        'readability' and 'overall_score' to floats; labels absent from the
        file (or with non-numeric values) are simply missing from the dict.
    """
    # Map file labels to output keys; also acts as the whitelist of lines
    # we attempt to convert to float.
    label_to_key = {
        'Comprehensiveness': 'comprehensiveness',
        'Insight': 'insight',
        'Instruction Following': 'instruction_following',
        'Readability': 'readability',
        'Overall Score': 'overall_score',
    }
    scores = {}
    with open(race_result_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if ':' not in line:
                continue
            label, value = line.split(':', 1)
            label = label.strip()
            if label not in label_to_key:
                continue
            try:
                # Stored as a 0..1 fraction; report as a percentage.
                scores[label_to_key[label]] = float(value.strip()) * 100
            except ValueError:
                # Skip lines whose value is not numeric instead of aborting
                # the whole parse (the original raised here for any ':' line).
                continue
    return scores
def parse_fact_result(fact_result_file):
    """Parse a fact_result.txt file into citation metrics.

    Args:
        fact_result_file: pathlib.Path of the fact_result.txt file; the file
            may be absent, in which case an empty dict is returned.

    Returns:
        dict with up to two keys: 'citation_accuracy' (valid_rate as a
        percentage) and 'effective_citations'.
    """
    citation_scores = {}
    if not fact_result_file.exists():
        return citation_scores
    with open(fact_result_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if ':' not in line:
                continue
            key, value = line.split(':', 1)
            key = key.strip()
            try:
                value = float(value.strip())
            except ValueError:
                # Ignore non-numeric value lines instead of aborting the
                # whole parse (the original raised here for any ':' line).
                continue
            if key == 'valid_rate':
                citation_scores['citation_accuracy'] = value * 100
            elif key in ('total_valid_citations', 'supported_per_task'):
                # NOTE(review): both labels write the same output key, so the
                # last one seen in the file wins — behavior preserved from the
                # original; confirm this precedence is intentional.
                citation_scores['effective_citations'] = value
    return citation_scores
def process_model_data(model_dir):
    """Process one model directory into a leaderboard row.

    Reads ``race_result.txt`` inside *model_dir* and, when present, the
    matching ``data/fact_results/<model>/fact_result.txt`` resolved relative
    to this script's project root.

    Args:
        model_dir: pathlib.Path of the model's raw-results directory; its
            name is taken as the model name.

    Returns:
        dict with the model's scores (citation fields are None when not
        available), or None when the data is missing or unparseable.
    """
    model_name = model_dir.name
    race_result_file = model_dir / "race_result.txt"
    if not race_result_file.exists():
        print(f"警告: 模型 {model_name} 的文件夹中未找到 race_result.txt")
        return None
    print(f"正在处理模型: {model_name}")
    try:
        scores = parse_race_result(race_result_file)
        if not scores:
            print(f" - 警告: 未能解析到有效分数")
            return None
        # Locate the matching fact_result.txt under data/fact_results/<model>/.
        project_root = Path(__file__).parent.parent
        fact_results_dir = project_root / "data" / "fact_results"
        fact_result_file = fact_results_dir / model_name / "fact_result.txt"
        citation_scores = parse_fact_result(fact_result_file)
        if citation_scores:
            # Format the accuracy only when present: the original applied
            # ``:.2f`` to the 'N/A' string fallback, raising ValueError (and
            # dropping the model) whenever only one citation metric parsed.
            accuracy = citation_scores.get('citation_accuracy')
            accuracy_text = f"{accuracy:.2f}" if accuracy is not None else "N/A"
            print(f" - 总分: {scores['overall_score']:.2f}, 引用准确率: {accuracy_text}%, 有效引用数: {citation_scores.get('effective_citations', 'N/A')}")
        else:
            print(f" - 总分: {scores['overall_score']:.2f}, 引用数据: 未找到")
        result = {
            'model': model_name,
            'overall_score': scores['overall_score'],
            'comprehensiveness': scores['comprehensiveness'],
            'insight': scores['insight'],
            'instruction_following': scores['instruction_following'],
            'readability': scores['readability'],
            # Citation metrics may be absent; callers render None as "-".
            'citation_accuracy': citation_scores.get('citation_accuracy', None),
            'effective_citations': citation_scores.get('effective_citations', None)
        }
        return result
    except Exception as e:
        # Any missing score key or I/O failure downgrades to a skipped model.
        print(f" - 错误: 处理文件时出错: {e}")
        return None
def rank_leaderboard():
    """Build the leaderboard CSV from every model's raw results.

    Scans ``data/raw_results/<model>/`` directories, scores each model,
    sorts by overall score (descending) and writes ``data/leaderboard.csv``.
    """
    project_root = Path(__file__).parent.parent
    input_dir = project_root / "data" / "raw_results"
    output_file = project_root / "data" / "leaderboard.csv"
    # Guard the directory itself: iterdir() on a missing path raises
    # FileNotFoundError and would crash the whole run.
    if not input_dir.is_dir():
        print("未找到任何模型文件夹")
        return
    model_dirs = [d for d in input_dir.iterdir() if d.is_dir()]
    print(f"找到 {len(model_dirs)} 个模型文件夹")
    if not model_dirs:
        print("未找到任何模型文件夹")
        return
    model_results = []
    for model_dir in model_dirs:
        try:
            result = process_model_data(model_dir)
            if result:
                model_results.append(result)
        except Exception as e:
            # One bad model directory must not abort the whole leaderboard.
            print(f"处理文件夹 {model_dir.name} 时出错: {e}")
            continue
    # Best-first ordering by overall score.
    model_results.sort(key=lambda x: x['overall_score'], reverse=True)
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['model', 'overall_score', 'comprehensiveness', 'insight', 'instruction_following', 'readability', 'citation_accuracy', 'effective_citations']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in model_results:
            # Format numbers to two decimals; missing citation data -> "-".
            row = {
                'model': result['model'],
                'overall_score': f"{result['overall_score']:.2f}",
                'comprehensiveness': f"{result['comprehensiveness']:.2f}",
                'insight': f"{result['insight']:.2f}",
                'instruction_following': f"{result['instruction_following']:.2f}",
                'readability': f"{result['readability']:.2f}",
                'citation_accuracy': f"{result['citation_accuracy']:.2f}" if result['citation_accuracy'] is not None else "-",
                'effective_citations': f"{result['effective_citations']:.2f}" if result['effective_citations'] is not None else "-"
            }
            writer.writerow(row)
    print(f"\n排行榜已保存到: {output_file}")
    print(f"共处理了 {len(model_results)} 个模型")
# Script entry point: build the leaderboard CSV, then report completion.
if __name__ == "__main__":
    rank_leaderboard()
    print("排行榜计算完成!")