#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Merge each model's raw article data with its evaluation scores and
write the combined records to data/data_viewer.jsonl."""

import json
from pathlib import Path


def load_scores_for_model(model_results_dir: Path) -> dict:
    """Load per-article scores from <model_results_dir>/raw_results.jsonl,
    keyed by article ID, with every score scaled to a 0-100 range."""
    scores_by_id = {}
    raw_results_file = model_results_dir / "raw_results.jsonl"
    if not raw_results_file.exists():
        print(f"Warning: results file for model {model_results_dir.name} not found: {raw_results_file}")
        return scores_by_id

    print(f"  Loading scores from {model_results_dir.name}/raw_results.jsonl ...")
    with open(raw_results_file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            try:
                data = json.loads(line.strip())
                # Check for a missing ID before converting: str(None) would be
                # the truthy string "None" and slip past the check below.
                raw_id = data.get('id')
                if raw_id is None:
                    print(f"  Warning: line {i + 1} of {model_results_dir.name} has no ID; skipped.")
                    continue
                article_id = str(raw_id)

                # Raw scores are on a 0-1 scale; scale them to 0-100 and
                # format with two decimal places.
                overall_score_raw = data.get('overall_score', 0.0)
                comprehensiveness_score_raw = data.get('comprehensiveness', 0.0)
                insight_score_raw = data.get('insight', 0.0)
                instruction_score_raw = data.get('instruction_following', 0.0)
                readability_score_raw = data.get('readability', 0.0)

                scores_by_id[article_id] = {
                    'overall_score': f"{overall_score_raw * 100:.2f}",
                    'comprehensiveness_score': f"{comprehensiveness_score_raw * 100:.2f}",
                    'insight_score': f"{insight_score_raw * 100:.2f}",
                    'instruction_following_score': f"{instruction_score_raw * 100:.2f}",
                    'readability_score': f"{readability_score_raw * 100:.2f}",
                }
            except json.JSONDecodeError as e:
                print(f"  Error: failed to parse JSON (file: {model_results_dir.name}, line: {i + 1}): {e}")
            except Exception as e:
                print(f"  Error: failed to process data (file: {model_results_dir.name}, line: {i + 1}): {e}")

    print(f"  Loaded scores for {len(scores_by_id)} articles for model {model_results_dir.name}")
    return scores_by_id


def merge_jsonl_files():
    """Merge every raw-data JSONL file with the matching model's scores."""
    project_root = Path(__file__).resolve().parent.parent
    raw_data_dir = project_root / "data" / "raw_data"
    raw_results_dir = project_root / "data" / "raw_results"
    output_file = project_root / "data" / "data_viewer.jsonl"

    input_files = list(raw_data_dir.glob("*.jsonl"))
    print(f"Found {len(input_files)} model JSONL files in {raw_data_dir}")
    if not input_files:
        print("No raw data files found; exiting.")
        return

    all_merged_data = []
    for raw_data_file in input_files:
        model_name = raw_data_file.stem
        print(f"Processing raw data file: {raw_data_file.name} (model: {model_name})")

        model_results_dir = raw_results_dir / model_name
        if not model_results_dir.exists():
            print(f"  Warning: results directory for model {model_name} not found: {model_results_dir}")
            continue

        scores_for_current_model = load_scores_for_model(model_results_dir)

        processed_articles_count = 0
        with open(raw_data_file, 'r', encoding='utf-8') as f_raw:
            for i, line in enumerate(f_raw):
                try:
                    article_data = json.loads(line.strip())
                    raw_id = article_data.get('id')
                    if raw_id is None:
                        print(f"  Warning: line {i + 1} of {raw_data_file.name} has no ID; skipped.")
                        continue
                    article_id = str(raw_id)

                    article_scores = scores_for_current_model.get(article_id, {})
                    if not article_scores:
                        print(f"  Warning: no scores found in the results file for article ID {article_id} of model {model_name}.")

                    merged_item = {
                        'model_name': model_name,
                        'id': article_id,
                        'prompt': article_data.get('prompt'),
                        'article': article_data.get('article'),
                        'overall_score': article_scores.get('overall_score'),
                        'comprehensiveness_score': article_scores.get('comprehensiveness_score'),
                        'insight_score': article_scores.get('insight_score'),
                        'instruction_following_score': article_scores.get('instruction_following_score'),
                        'readability_score': article_scores.get('readability_score'),
                    }
                    all_merged_data.append(merged_item)
                    processed_articles_count += 1
                except json.JSONDecodeError as e:
                    print(f"  Error: failed to parse raw data JSON (file: {raw_data_file.name}, line: {i + 1}): {e}")
                except Exception as e:
                    print(f"  Error: failed to process raw data (file: {raw_data_file.name}, line: {i + 1}): {e}")

        print(f"  Processed {processed_articles_count} articles for model {model_name}.")

    # Opening with mode 'w' truncates the file, so no separate
    # truncation pass is needed beforehand.
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for item in all_merged_data:
            f_out.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"\nSuccessfully merged and saved to {output_file}: {len(all_merged_data)} records in total")


if __name__ == "__main__":
    merge_jsonl_files()
    print("All files processed!")
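
# For reference, a hypothetical sketch of the record shapes this script
# expects, inferred only from the fields read above; the IDs and score
# values are made up, and real files may carry additional keys.
#
#   data/raw_data/<model_name>.jsonl, one article per line:
#     {"id": "a1", "prompt": "...", "article": "..."}
#
#   data/raw_results/<model_name>/raw_results.jsonl, scores on a 0-1 scale:
#     {"id": "a1", "overall_score": 0.85, "comprehensiveness": 0.90,
#      "insight": 0.80, "instruction_following": 0.88, "readability": 0.82}
#
#   data/data_viewer.jsonl (output), scores rescaled to 0-100 as strings:
#     {"model_name": "<model_name>", "id": "a1", "prompt": "...",
#      "article": "...", "overall_score": "85.00", ...}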