#!/usr/bin/env python3
"""
Batch TestTime RLVR Evaluation Script
Runs the TestTime RLVR pipeline over an entire benchmark and evaluates
initial-solution correctness and reasoning-task performance.
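
Example usage (a sketch; the flag names are assumed from the args.* attributes used below):
    python batch_evaluate_testtime.py --model <model_name> --benchmark mbpp \
        --max_problems 10 --gpu 0 --output_dir ./results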
"""
import os
import sys
import json
import argparse
import time
import re
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any
import traceback
# TestTime RLVR module imports
sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2')
from absolute_zero_reasoner.testtime.complete_pipeline import CompleteTestTimePipeline
from absolute_zero_reasoner.testtime.config import TestTimeConfig, BenchmarkConfig
from absolute_zero_reasoner.testtime.logger import TestTimeLogger
from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator
from absolute_zero_reasoner.testtime.prompts import get_prompt, get_diversity_instruction
def generate_detailed_classification(output_dir: str, benchmark: str) -> str:
"""๋ฐฐ์น˜ ํ‰๊ฐ€ ๊ฒฐ๊ณผ๋ฅผ 4๊ฐ€์ง€ ์นดํ…Œ๊ณ ๋ฆฌ๋กœ ์ƒ์„ธ ๋ถ„๋ฅ˜"""
base_dir = os.path.join(output_dir, benchmark)
if not os.path.exists(base_dir):
return f"## ๐Ÿ“Š Detailed Problem Classification\n\nโš ๏ธ Benchmark directory not found: {base_dir}\n\n"
# Four result categories
complete_success = [] # 100% of tests passed
partial_success = [] # some tests passed (stored with the success rate)
complete_failure = [] # 0% of tests passed
execution_failure = [] # execution failure (division by zero, etc.)
# Walk every problem directory
for problem_dir in sorted(Path(base_dir).iterdir()):
if not problem_dir.is_dir():
continue
problem_id = problem_dir.name
# Check the current_evaluation directory (baseline evaluation results)
current_eval_file = problem_dir / "current_evaluation" / "attempt_1.txt"
if not current_eval_file.exists():
execution_failure.append(f"{problem_id} (file not found)")
continue
# Extract the result from the file
try:
with open(current_eval_file, 'r', encoding='utf-8') as f:
content = f.read()
# Find the Result line
result_pattern = r'Result: (.+) \((\d+)/(\d+) tests passed\)'
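# Example of a line this matches (as written by the evaluation savers below): "Result: ✅ CORRECT (5/7 tests passed)"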
match = re.search(result_pattern, content)
if match:
status = match.group(1)
passed = int(match.group(2))
total = int(match.group(3))
if total == 0:
execution_failure.append(f"{problem_id} (0 total tests)")
elif passed == total:
complete_success.append(problem_id)
elif passed == 0:
complete_failure.append(problem_id)
else:
ratio = passed / total * 100
partial_success.append((problem_id, passed, total, ratio))
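# Each partial_success entry is a (problem_id, passed, total, percent_passed) tuple, e.g. (problem_id, 3, 7, 42.9)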
else:
execution_failure.append(f"{problem_id} (no result pattern)")
except Exception as e:
if "division by zero" in str(e):
execution_failure.append(f"{problem_id} (division by zero)")
else:
execution_failure.append(f"{problem_id} (error: {str(e)[:50]})")
# Sort partial successes by ascending success rate
partial_success.sort(key=lambda x: x[3]) # sort by ratio
# Build the report in Markdown format
result = "## ๐Ÿ“Š Detailed Problem Classification\n\n"
result += f"### ๐ŸŸข Complete Success (Baseline = 100%)\n"
result += f"**Count: {len(complete_success)} problems**\n"
result += "**Task IDs:**\n"
# Print ten task IDs per line
for i in range(0, len(complete_success), 10):
line_tasks = complete_success[i:i+10]
result += "- " + ", ".join(line_tasks) + "\n"
result += "\n"
result += f"### ๐ŸŸก Partial Success (0% < Baseline < 100%)\n"
result += f"**Count: {len(partial_success)} problems**\n"
result += "**Task IDs (ordered by success rate, lowest first):**\n"
for problem_id, passed, total, ratio in partial_success:
result += f"- {problem_id}: {passed}/{total} ({ratio:.1f}%)\n"
result += "\n"
result += f"### ๐Ÿ”ด Complete Failure (Baseline = 0%)\n"
result += f"**Count: {len(complete_failure)} problems**\n"
result += "**Task IDs:**\n"
# Print ten task IDs per line
for i in range(0, len(complete_failure), 10):
line_tasks = complete_failure[i:i+10]
result += "- " + ", ".join(line_tasks) + "\n"
result += "\n"
result += f"### โŒ Execution Failure (Syntax/Import/Runtime Errors)\n"
result += f"**Count: {len(execution_failure)} problems**\n"
result += "**Task IDs:**\n"
for task in execution_failure:
result += f"- {task}\n"
result += "\n"
result += f"### ๐Ÿ“ˆ Summary Statistics\n"
total_analyzed = len(complete_success) + len(partial_success) + len(complete_failure) + len(execution_failure)
if total_analyzed > 0:
result += f"- Total Problems with Results: {total_analyzed}\n"
result += f"- Baseline Success Rate: {len(complete_success)/total_analyzed*100:.1f}%\n"
result += f"- Partial Success Rate: {len(partial_success)/total_analyzed*100:.1f}%\n"
result += f"- Complete Failure Rate: {len(complete_failure)/total_analyzed*100:.1f}%\n"
result += f"- Execution Failure Rate: {len(execution_failure)/total_analyzed*100:.1f}%\n"
result += f"\n**Note**: This analysis is based on baseline evaluation (attempt_1.txt) results.\n"
result += f"Problems that failed during early pipeline stages may not appear in these statistics.\n"
result += "\n"
return result
def load_benchmark_problems(benchmark_config: BenchmarkConfig) -> List[str]:
"""๋ฒค์น˜๋งˆํฌ์—์„œ ๋ฌธ์ œ ID ๋ชฉ๋ก ๋กœ๋“œ (EvalPlus ํ‘œ์ค€ ๋ฐฉ์‹ ์‚ฌ์šฉ)"""
problems = []
if benchmark_config.name == 'mbpp':
# Load MBPP+ data via the standard EvalPlus loader
try:
from evalplus.data.mbpp import get_mbpp_plus
mbpp_problems = get_mbpp_plus() # mbpp_deserialize_inputs is applied automatically
problems = list(mbpp_problems.keys())
print(f"✅ Loaded MBPP+ data: {len(problems)} problems (standard EvalPlus loader)")
except Exception as e:
print(f"❌ MBPP+ EvalPlus loading failed, falling back to the original method: {e}")
# Fallback to original method
data_path = benchmark_config.data_path
if os.path.exists(data_path):
with open(data_path, 'r') as f:
for line in f:
try:
data = json.loads(line.strip())
if 'task_id' in data:
problems.append(data['task_id'])
except Exception:
continue
elif benchmark_config.name == 'humaneval':
# Load HumanEval+ data via the standard EvalPlus loader
try:
from evalplus.data.humaneval import get_human_eval_plus
humaneval_problems = get_human_eval_plus() # standard EvalPlus loader
problems = list(humaneval_problems.keys())
print(f"✅ Loaded HumanEval+ data: {len(problems)} problems (standard EvalPlus loader)")
except Exception as e:
print(f"❌ HumanEval+ EvalPlus loading failed, falling back to the original method: {e}")
# Fallback to original method
data_path = benchmark_config.data_path
if os.path.exists(data_path):
with open(data_path, 'r') as f:
for line in f:
try:
data = json.loads(line.strip())
if 'task_id' in data:
problems.append(data['task_id'])
except Exception:
continue
return problems
def get_completed_problems(output_dir: str) -> set:
"""์™„๋ฃŒ๋œ ๋ฌธ์ œ ID ๋ชฉ๋ก ๋กœ๋“œ (resume ๊ธฐ๋Šฅ์šฉ)"""
completed = set()
# ๊ธฐ์กด JSON ๊ฒฐ๊ณผ ํŒŒ์ผ์—์„œ ์™„๋ฃŒ๋œ ๋ฌธ์ œ๋“ค ์ถ”์ถœ
json_file = os.path.join(output_dir, "batch_evaluation_results.json")
if os.path.exists(json_file):
try:
with open(json_file, 'r', encoding='utf-8') as f:
data = json.load(f)
for result in data.get('problem_results', []):
problem_id = result.get('problem_id')
if problem_id:
completed.add(problem_id)
except Exception as e:
print(f"โš ๏ธ Warning: Could not load existing results: {e}")
return completed
def save_initial_solution_only(result, output_dir, timestamp, problem_id):
"""LLM Generation ์„ฑ๊ณต์‹œ initial_solution๋งŒ ์ €์žฅ"""
# ๋ฒค์น˜๋งˆํฌ์™€ ๋ฌธ์ œ ID์— ๋”ฐ๋ฅธ ๋””๋ ‰ํ† ๋ฆฌ ๊ตฌ์กฐ ์ƒ์„ฑ
benchmark = result.get('benchmark', 'unknown')
problem_id_safe = problem_id.replace('/', '_')
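# e.g. a task ID like "HumanEval/0" becomes "HumanEval_0" so it can be used as a directory name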
# Create directories under {output_dir}/{benchmark}/{problem_id}
base_dir = os.path.join(output_dir, benchmark, problem_id_safe)
os.makedirs(base_dir, exist_ok=True)
# Create the initial_solution directory
initial_solution_dir = os.path.join(base_dir, 'initial_solution')
os.makedirs(initial_solution_dir, exist_ok=True)
# Check whether the LLM generation step is present
if 'steps' in result and 'llm_generation' in result['steps']:
llm_step = result['steps']['llm_generation']
# Save the original benchmark problem
if 'problem_loading' in result['steps']:
problem_data = result['steps']['problem_loading'].get('problem', {})
problem_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_original_problem.txt")
with open(problem_file, 'w', encoding='utf-8') as f:
f.write(f"Problem ID: {problem_id}\n")
f.write(f"Benchmark: {benchmark}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n")
f.write("ORIGINAL BENCHMARK PROBLEM:\n")
f.write("="*80 + "\n")
f.write(problem_data.get('prompt', 'No prompt available'))
f.write("\n" + "="*80 + "\n")
f.write("FULL LLM PROMPT:\n")
f.write("="*80 + "\n")
# Reproduce the full prompt used by solution_generator.py
problem_prompt = problem_data.get('prompt', '')
# For HumanEval, ask for function completion
if 'HumanEval' in problem_id:
full_prompt = f"""You are a Python writing assistant. Complete the following Python function.
{problem_prompt}
Please provide a complete implementation of the function."""
else:
# For MBPP and other benchmarks, use the existing prompt
full_prompt = f"""
Please generate a complete, self-contained Python script that solves the following problem.
- Wrap the entire script in a Markdown code block with syntax highlighting (```python ... ```).
- For each function, include a concise docstring enclosed in triple single quotes (''' ... '''), placed immediately below the def line.
The docstring should briefly describe:
• The function's purpose
• Input parameters
• Return value
Problem statement:
{problem_prompt}
"""
f.write(full_prompt.strip())
f.write("\n" + "="*80 + "\n")
f.write("ENTRY POINT:\n")
f.write("="*80 + "\n")
f.write(problem_data.get('entry_point', 'No entry point'))
if 'canonical_solution' in problem_data:
f.write("\n" + "="*80 + "\n")
f.write("CANONICAL SOLUTION:\n")
f.write("="*80 + "\n")
f.write(problem_data.get('canonical_solution', ''))
# Save the LLM-generated solution
llm_solution_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_llm_solution.txt")
with open(llm_solution_file, 'w', encoding='utf-8') as f:
f.write(f"Problem ID: {problem_id}\n")
f.write(f"Benchmark: {benchmark}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n")
f.write("LLM GENERATED SOLUTION:\n")
f.write("="*80 + "\n")
f.write(llm_step.get('solution', 'No solution generated'))
f.write("\n" + "="*80 + "\n")
f.write("SYNTAX VALIDATION:\n")
f.write("="*80 + "\n")
syntax_valid = llm_step.get('syntax_valid', False)
f.write(f"Valid: {'โœ… YES' if syntax_valid else 'โŒ NO'}")
if llm_step.get('syntax_error'):
f.write(f"\nError: {llm_step['syntax_error']}")
# Append the initial solution correctness evaluation
f.write("\n" + "="*80 + "\n")
f.write("SOLUTION CORRECTNESS EVALUATION:\n")
f.write("="*80 + "\n")
solution_eval = llm_step.get('solution_evaluation')
if solution_eval:
if solution_eval['correct']:
f.write(f"Result: โœ… CORRECT ({solution_eval['passed_tests']}/{solution_eval['total_tests']} tests passed)\n")
else:
f.write(f"Result: โŒ INCORRECT ({solution_eval['passed_tests']}/{solution_eval['total_tests']} tests passed)\n")
if solution_eval.get('error'):
f.write(f"Error: {solution_eval['error']}\n")
else:
f.write("No evaluation performed (syntax error or evaluation failed)\n")
def save_current_evaluation_details(result, base_dir, timestamp):
"""ํ˜„์žฌ ์„ฑ๋Šฅ ํ‰๊ฐ€ ์ƒ์„ธ ์ •๋ณด ์ €์žฅ - ๊ฐ ์‹œ๋„๋ณ„ ๊ฐœ๋ณ„ ํŒŒ์ผ ์ƒ์„ฑ"""
if 'baseline_evaluation' in result['steps']:
baseline_step = result['steps']['baseline_evaluation']
# Create the current_evaluation directory
current_dir = os.path.join(base_dir, 'current_evaluation')
os.makedirs(current_dir, exist_ok=True)
# Fetch the original problem data
problem_data = result['steps'].get('problem_loading', {}).get('problem', {})
problem_id = result['problem_id']
benchmark = result.get('benchmark', 'unknown')
# Write one file per round
solutions = baseline_step.get('solutions', [])
for solution_result in solutions:
round_id = solution_result.get('round_id', 0)
attempt_file = os.path.join(current_dir, f'attempt_{round_id + 1}.txt')
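# round_id is 0-based, so attempts are saved as attempt_1.txt, attempt_2.txt, ...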
with open(attempt_file, 'w', encoding='utf-8') as f:
f.write(f"Current Evaluation - Attempt {round_id + 1}\n")
f.write(f"Problem ID: {problem_id}\n")
f.write(f"Benchmark: {benchmark}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n\n")
# 1. Original problem
f.write("1. ORIGINAL PROBLEM:\n")
f.write("="*80 + "\n")
f.write(problem_data.get('prompt', 'No prompt available'))
f.write("\n" + "="*80 + "\n\n")
# 2. Script fed to the LLM (the prompt)
f.write("2. LLM INPUT SCRIPT (PROMPT):\n")
f.write("="*80 + "\n")
problem_prompt = problem_data.get('prompt', '')
# Use the central prompt system
if 'HumanEval' in problem_id:
full_prompt = get_prompt("solution_humaneval_basic",
problem_prompt=problem_prompt)
else:
full_prompt = get_prompt("solution_mbpp_basic",
problem_prompt=problem_prompt)
f.write(full_prompt.strip())
f.write("\n" + "="*80 + "\n\n")
# 3. LLM response
f.write("3. LLM RESPONSE:\n")
f.write("="*80 + "\n")
f.write(solution_result.get('solution', 'No solution generated'))
f.write("\n" + "="*80 + "\n\n")
# 4. Correctness
f.write("4. CORRECTNESS EVALUATION:\n")
f.write("="*80 + "\n")
# Syntax validation
f.write(f"Syntax Valid: {'✅ YES' if solution_result.get('syntax_valid', False) else '❌ NO'}\n")
if solution_result.get('syntax_error'):
f.write(f"Syntax Error: {solution_result['syntax_error']}\n")
# Correctness evaluation
evaluation = solution_result.get('evaluation')
if evaluation:
if evaluation.get('correct', False):
f.write(f"Result: โœ… CORRECT ({evaluation.get('passed_tests', 0)}/{evaluation.get('total_tests', 0)} tests passed)\n")
else:
f.write(f"Result: โŒ INCORRECT ({evaluation.get('passed_tests', 0)}/{evaluation.get('total_tests', 0)} tests passed)\n")
if evaluation.get('error'):
f.write(f"Evaluation Error: {evaluation['error']}\n")
else:
f.write("Result: โŒ NO EVALUATION (syntax error or evaluation failed)\n")
f.write("="*80 + "\n")
# Also write a summary file (overall statistics)
summary_file = os.path.join(current_dir, 'summary.txt')
with open(summary_file, 'w', encoding='utf-8') as f:
f.write(f"Current Evaluation Summary\n")
f.write(f"Problem ID: {result['problem_id']}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n\n")
# Overall statistics
f.write("OVERALL STATISTICS:\n")
f.write("="*80 + "\n")
f.write(f"Total Attempts: {baseline_step.get('total_rounds', 0)}\n")
f.write(f"Successful Attempts: {baseline_step.get('success_count', 0)}\n")
f.write(f"Success Rate: {baseline_step.get('average_accuracy', 0.0):.3f}\n")
f.write(f"Evaluation Status: {'โœ… SUCCESS' if baseline_step.get('success', False) else 'โŒ FAILED'}\n")
if baseline_step.get('error'):
f.write(f"Error: {baseline_step['error']}\n")
f.write("\n")
f.write("Individual attempt files: attempt_1.txt, attempt_2.txt, attempt_3.txt, attempt_4.txt, attempt_5.txt\n")
def save_diverse_programs_details(result, base_dir, timestamp):
"""๋‹ค์–‘ํ•œ ํ”„๋กœ๊ทธ๋žจ ์ƒ์„ฑ ์ƒ์„ธ ์ •๋ณด ์ €์žฅ"""
if 'diverse_programs' in result['steps']:
diverse_step = result['steps']['diverse_programs']
# Create the diverse_programs directory
diverse_dir = os.path.join(base_dir, 'diverse_programs')
os.makedirs(diverse_dir, exist_ok=True)
# Save the summary file
summary_file = os.path.join(diverse_dir, 'diverse_summary.txt')
with open(summary_file, 'w', encoding='utf-8') as f:
f.write(f"Diverse Programs Generation\n")
f.write(f"Problem ID: {result['problem_id']}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n\n")
# Overall statistics
f.write("DIVERSE PROGRAMS STATISTICS:\n")
f.write("="*80 + "\n")
f.write(f"Total Programs: {diverse_step.get('total_programs', 0)}\n")
f.write(f"Valid Programs: {diverse_step.get('valid_programs', 0)}\n")
f.write(f"Total IPO Triples: {diverse_step.get('total_ipo_triples', 0)}\n")
f.write(f"Generation Status: {'โœ… SUCCESS' if diverse_step.get('success', False) else 'โŒ FAILED'}\n")
if diverse_step.get('error'):
f.write(f"Error: {diverse_step['error']}\n")
f.write("\n\n")
# Per-program results
f.write("PROGRAM-BY-PROGRAM RESULTS:\n")
f.write("="*80 + "\n")
programs = diverse_step.get('programs', [])
for program_result in programs:
variation_id = program_result.get('variation_id', 0)
f.write(f"\nProgram {variation_id + 1}:\n")
f.write(f" Syntax Valid: {'โœ…' if program_result.get('syntax_valid', False) else 'โŒ'}\n")
if program_result.get('syntax_error'):
f.write(f" Syntax Error: {program_result['syntax_error']}\n")
f.write(f" IPO Triples: {program_result.get('num_ipo_triples', 0)}\n")
f.write(f" Generated Inputs: {program_result.get('num_generated_inputs', 0)}\n")
# Save each program's solution and IPO triples
programs = diverse_step.get('programs', [])
for program_result in programs:
variation_id = program_result.get('variation_id', 0)
# Create a per-program directory
program_dir = os.path.join(diverse_dir, f'program_{variation_id + 1}')
os.makedirs(program_dir, exist_ok=True)
# Save full details (prompt + solution)
detail_file = os.path.join(program_dir, 'generation_details.txt')
with open(detail_file, 'w', encoding='utf-8') as f:
f.write(f"Diverse Program {variation_id + 1} - Generation Details\n")
f.write(f"Problem ID: {result['problem_id']}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n\n")
# 1. Original problem
problem_data = result['steps'].get('problem_loading', {}).get('problem', {})
f.write("1. ORIGINAL PROBLEM:\n")
f.write("="*80 + "\n")
f.write(problem_data.get('prompt', 'No prompt available'))
f.write("\n" + "="*80 + "\n\n")
# 2. Diversity prompt used (LLM input)
f.write("2. DIVERSITY PROMPT USED:\n")
f.write("="*80 + "\n")
# Use the central prompt system
diversity_instruction = get_diversity_instruction(variation_id)
problem_prompt = problem_data.get('prompt', '')
problem_id = result['problem_id']
# Build the prompt depending on HumanEval vs. MBPP
if 'HumanEval' in problem_id:
full_prompt = get_prompt("diverse_humaneval_basic",
diversity_instruction=diversity_instruction,
problem_prompt=problem_prompt)
else:
full_prompt = get_prompt("diverse_mbpp_basic",
diversity_instruction=diversity_instruction,
problem_prompt=problem_prompt)
f.write(full_prompt.strip())
f.write("\n" + "="*80 + "\n\n")
# 3. LLM response
f.write("3. LLM RESPONSE:\n")
f.write("="*80 + "\n")
f.write(program_result.get('solution', 'No solution generated'))
f.write("\n" + "="*80 + "\n\n")
# 4. Evaluation results
f.write("4. EVALUATION RESULTS:\n")
f.write("="*80 + "\n")
f.write(f"Syntax Valid: {'โœ… YES' if program_result.get('syntax_valid', False) else 'โŒ NO'}\n")
if program_result.get('syntax_error'):
f.write(f"Syntax Error: {program_result['syntax_error']}\n")
f.write(f"IPO Triples Generated: {program_result.get('num_ipo_triples', 0)}\n")
f.write(f"Input Generation: {program_result.get('num_generated_inputs', 0)} new inputs\n")
f.write("="*80 + "\n")
# Save the solution on its own (for backwards compatibility)
solution_file = os.path.join(program_dir, 'solution.py')
with open(solution_file, 'w', encoding='utf-8') as f:
f.write(f"# Diverse Program {variation_id + 1}\n")
f.write(f"# Problem ID: {result['problem_id']}\n")
f.write(f"# Generated: {timestamp}\n")
f.write(f"# Syntax Valid: {program_result.get('syntax_valid', False)}\n")
f.write(f"# IPO Triples: {program_result.get('num_ipo_triples', 0)}\n")
f.write("\n")
f.write(program_result.get('solution', '# No solution available'))
# Save IPO triples
ipo_triples = program_result.get('ipo_triples', [])
if ipo_triples:
ipo_dir = os.path.join(program_dir, 'ipo_triples')
os.makedirs(ipo_dir, exist_ok=True)
for i, triple in enumerate(ipo_triples):
triple_file = os.path.join(ipo_dir, f'triple_{i + 1}.json')
with open(triple_file, 'w', encoding='utf-8') as f:
json.dump(triple, f, indent=2, ensure_ascii=False)
# Save input generation info (new structure)
input_gen_info = program_result.get('input_generation_info')
if input_gen_info is not None:
input_gen_file = os.path.join(program_dir, 'input_generation_details.txt')
with open(input_gen_file, 'w', encoding='utf-8') as f:
f.write(f"Input Generation Details - Program {variation_id + 1}\n")
f.write(f"Problem ID: {result['problem_id']}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n\n")
f.write("1. FUNCTION INFO:\n")
f.write("="*80 + "\n")
func_info = input_gen_info.get('function_info', {})
f.write(f"Function Name: {func_info.get('name', 'N/A')}\n")
f.write(f"Parameters: {func_info.get('params', 'N/A')}\n")
f.write(f"Parameters String: {func_info.get('params_str', 'N/A')}\n\n")
f.write("2. ARGUMENT TYPE INFO:\n")
f.write("="*80 + "\n")
f.write(input_gen_info.get('arg_type_info', 'N/A') + "\n\n")
f.write("3. EXISTING EXAMPLES:\n")
f.write("="*80 + "\n")
for i, (inp, out) in enumerate(input_gen_info.get('existing_examples', [])):
f.write(f"Example {i+1}: Input: {inp} โ†’ Output: {out}\n")
f.write("\n")
f.write("4. LLM PROMPT:\n")
f.write("="*80 + "\n")
f.write(input_gen_info.get('prompt', 'N/A') + "\n")
f.write("="*80 + "\n\n")
f.write("5. LLM RESPONSE:\n")
f.write("="*80 + "\n")
f.write(input_gen_info.get('llm_response', 'N/A') + "\n")
f.write("="*80 + "\n\n")
f.write("6. EXTRACTED INPUTS:\n")
f.write("="*80 + "\n")
extracted = input_gen_info.get('extracted_inputs', [])
if extracted:
for i, inp_data in enumerate(extracted):
f.write(f"Input {i+1}: {inp_data}\n")
else:
f.write("No inputs extracted\n")
# Show the error if one occurred
if 'error' in input_gen_info:
f.write("\n7. ERROR:\n")
f.write("="*80 + "\n")
f.write(input_gen_info['error'] + "\n")
def save_input_generation_details(result, base_dir, timestamp):
"""์ž…๋ ฅ ์ƒ์„ฑ ๊ด€๋ จ ์ƒ์„ธ ์ •๋ณด ์ €์žฅ"""
if 'ipo_extraction' in result['steps']:
ipo_step = result['steps']['ipo_extraction']
num_generated = ipo_step.get('num_generated', 0)
generated_inputs = ipo_step.get('generated_inputs', [])
generation_prompt = ipo_step.get('generation_prompt', '')
input_generation_attempted = bool(generation_prompt) or len(generated_inputs) > 0
# Whenever the input generation step exists, always create the directory (even on failure, to aid debugging)
if 'ipo_extraction' in result['steps']:
# Create the input_generation directory
input_gen_dir = os.path.join(base_dir, 'input_generation')
os.makedirs(input_gen_dir, exist_ok=True)
# Save the details file
details_file = os.path.join(input_gen_dir, 'generation_details.txt')
with open(details_file, 'w', encoding='utf-8') as f:
f.write(f"Input Generation Details\n")
f.write(f"Problem ID: {result['problem_id']}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n\n")
# Statistics
f.write("GENERATION STATISTICS:\n")
f.write("="*80 + "\n")
f.write(f"Original IPO triples: {ipo_step.get('num_original', 0)}\n")
f.write(f"Generated inputs: {ipo_step.get('num_generated', 0)}\n")
f.write(f"Total IPO triples: {ipo_step.get('num_triples', 0)}\n")
f.write(f"Input generation attempted: {input_generation_attempted}\n")
# Failure analysis
if not input_generation_attempted:
f.write(f"FAILURE REASON: Input generation was not attempted\n")
elif num_generated == 0:
f.write(f"FAILURE REASON: LLM response could not be parsed or contained no valid inputs\n")
# LLM prompt
f.write("\n\n" + "="*80 + "\n")
f.write("LLM INPUT GENERATION PROMPT:\n")
f.write("="*80 + "\n")
f.write(ipo_step.get('generation_prompt', 'No prompt available'))
# LLM response
f.write("\n\n" + "="*80 + "\n")
f.write("LLM RESPONSE:\n")
f.write("="*80 + "\n")
f.write(ipo_step.get('generation_response', 'No response available'))
# Extracted inputs
f.write("\n\n" + "="*80 + "\n")
f.write("EXTRACTED AND VALIDATED INPUTS:\n")
f.write("="*80 + "\n")
generated_inputs = ipo_step.get('generated_inputs', [])
if generated_inputs:
for i, inp in enumerate(generated_inputs):
f.write(f"\nInput {i+1}:\n")
f.write(f"{inp}\n")
else:
f.write("No valid inputs were extracted.\n")
def save_detailed_results(result, output_dir, timestamp):
"""์ƒ์„ธํ•œ ๊ฒฐ๊ณผ๋ฅผ ๊ฐœ๋ณ„ ํŒŒ์ผ๋กœ ์ €์žฅ (test_complete_pipeline.py ์Šคํƒ€์ผ)"""
# ๋ฒค์น˜๋งˆํฌ์™€ ๋ฌธ์ œ ID์— ๋”ฐ๋ฅธ ๋””๋ ‰ํ† ๋ฆฌ ๊ตฌ์กฐ ์ƒ์„ฑ
benchmark = result.get('benchmark', 'unknown')
problem_id = result['problem_id']
problem_id_safe = problem_id.replace('/', '_')
# Create directories under {output_dir}/{benchmark}/{problem_id}
base_dir = os.path.join(output_dir, benchmark, problem_id_safe)
os.makedirs(base_dir, exist_ok=True)
# 1. Save the initial LLM solution
if 'llm_generation' in result['steps']:
llm_step = result['steps']['llm_generation']
initial_solution_dir = os.path.join(base_dir, 'initial_solution')
os.makedirs(initial_solution_dir, exist_ok=True)
# Save the original benchmark problem
if 'problem_loading' in result['steps']:
problem_data = result['steps']['problem_loading'].get('problem', {})
problem_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_original_problem.txt")
with open(problem_file, 'w', encoding='utf-8') as f:
f.write(f"Problem ID: {result['problem_id']}\n")
f.write(f"Benchmark: {result['benchmark']}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n")
f.write("ORIGINAL BENCHMARK PROBLEM:\n")
f.write("="*80 + "\n")
f.write(problem_data.get('prompt', 'No prompt available'))
f.write("\n" + "="*80 + "\n")
f.write("FULL LLM PROMPT:\n")
f.write("="*80 + "\n")
# Reproduce the full prompt used by solution_generator.py
problem_prompt = problem_data.get('prompt', '')
# For HumanEval, ask for function completion
if 'HumanEval' in problem_id:
full_prompt = f"""You are a Python writing assistant. Complete the following Python function.
{problem_prompt}
Please provide a complete implementation of the function."""
else:
# For MBPP and other benchmarks, use the existing prompt
full_prompt = f"""
Please generate a complete, self-contained Python script that solves the following problem.
- Wrap the entire script in a Markdown code block with syntax highlighting (```python ... ```).
- For each function, include a concise docstring enclosed in triple single quotes (''' ... '''), placed immediately below the def line.
The docstring should briefly describe:
• The function's purpose
• Input parameters
• Return value
Problem statement:
{problem_prompt}
"""
f.write(full_prompt.strip())
f.write("\n" + "="*80 + "\n")
f.write("ENTRY POINT:\n")
f.write("="*80 + "\n")
f.write(problem_data.get('entry_point', 'No entry point'))
if 'canonical_solution' in problem_data:
f.write("\n" + "="*80 + "\n")
f.write("CANONICAL SOLUTION:\n")
f.write("="*80 + "\n")
f.write(problem_data.get('canonical_solution', ''))
if 'test' in problem_data:
f.write("\n" + "="*80 + "\n")
f.write("TEST CASES:\n")
f.write("="*80 + "\n")
f.write(str(problem_data.get('test', '')))
# Save the LLM-generated solution
llm_solution_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_llm_solution.txt")
with open(llm_solution_file, 'w', encoding='utf-8') as f:
f.write(f"Problem ID: {result['problem_id']}\n")
f.write(f"Benchmark: {result['benchmark']}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n")
f.write("LLM GENERATED SOLUTION:\n")
f.write("="*80 + "\n")
f.write(llm_step.get('solution', 'No solution generated'))
f.write("\n" + "="*80 + "\n")
f.write("SYNTAX VALIDATION:\n")
f.write("="*80 + "\n")
syntax_valid = llm_step.get('syntax_valid', False)
f.write(f"Valid: {'โœ… YES' if syntax_valid else 'โŒ NO'}")
if llm_step.get('syntax_error'):
f.write(f"\nError: {llm_step['syntax_error']}")
# Append the initial solution correctness evaluation
f.write("\n" + "="*80 + "\n")
f.write("SOLUTION CORRECTNESS EVALUATION:\n")
f.write("="*80 + "\n")
solution_eval = llm_step.get('solution_evaluation')
if solution_eval:
if solution_eval['correct']:
f.write(f"Result: โœ… CORRECT ({solution_eval['passed_tests']}/{solution_eval['total_tests']} tests passed)\n")
else:
f.write(f"Result: โŒ INCORRECT ({solution_eval['passed_tests']}/{solution_eval['total_tests']} tests passed)\n")
if solution_eval.get('error'):
f.write(f"Error: {solution_eval['error']}\n")
else:
f.write("No evaluation performed (syntax error or no test cases)\n")
# 2. Save IPO triples
if 'ipo_extraction' in result['steps']:
ipo_step = result['steps']['ipo_extraction']
triples = ipo_step.get('triples', [])
if triples:
ipo_dir = os.path.join(base_dir, 'ipo_triples')
os.makedirs(ipo_dir, exist_ok=True)
for i, triple in enumerate(triples):
triple_file = os.path.join(ipo_dir, f"{problem_id_safe}_triple_{i+1}.json")
with open(triple_file, 'w', encoding='utf-8') as f:
json.dump(triple, f, indent=2, ensure_ascii=False)
# 3. Save the generated task prompts
if 'task_generation' in result['steps']:
task_step = result['steps']['task_generation']
all_tasks = task_step.get('all_tasks', {})
if all_tasks:
task_dir = os.path.join(base_dir, 'task_prompts')
os.makedirs(task_dir, exist_ok=True)
for task_type, tasks in all_tasks.items():
for i, task in enumerate(tasks):
task_file = os.path.join(task_dir, f"{problem_id_safe}_{task_type}_{i+1}.txt")
with open(task_file, 'w', encoding='utf-8') as f:
f.write(f"Task Type: {task_type}\n")
f.write(f"Task ID: {task.get('task_id', 'N/A')}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n")
f.write("TASK PROMPT:\n")
f.write("="*80 + "\n")
f.write(task.get('prompt', 'No prompt available'))
# 4. Save the LLM task responses
if 'task_evaluation' in result['steps']:
eval_step = result['steps']['task_evaluation']
evaluations = eval_step.get('evaluations', {})
response_dir = os.path.join(base_dir, 'llm_responses')
os.makedirs(response_dir, exist_ok=True)
response_count = 0
for task_type, task_evals in evaluations.items():
for i, evaluation in enumerate(task_evals):
response_file = os.path.join(response_dir, f"{problem_id_safe}_{task_type}_{i+1}_response.txt")
with open(response_file, 'w', encoding='utf-8') as f:
f.write(f"Task Type: {task_type}\n")
f.write(f"Task ID: {evaluation.get('task_id', 'N/A')}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n")
f.write("ORIGINAL PROMPT:\n")
f.write("="*80 + "\n")
f.write(evaluation.get('prompt', 'No prompt available'))
f.write("\n" + "="*80 + "\n")
f.write("LLM RESPONSE:\n")
f.write("="*80 + "\n")
f.write(evaluation.get('llm_response', 'No response'))
f.write("\n" + "="*80 + "\n")
f.write("EXPECTED SOLUTION:\n")
f.write("="*80 + "\n")
f.write(evaluation.get('expected_solution', 'No expected solution'))
# Append the extracted answer (taken from the reward computation results)
if 'reward_computation' in result['steps']:
reward_step = result['steps']['reward_computation']
rewards = reward_step.get('rewards', {})
rewards_by_type = rewards.get('rewards_by_type', {})
# Find the reward entry for the current task
current_task_rewards = rewards_by_type.get(task_type, [])
current_reward = None
for reward in current_task_rewards:
if reward.get('task_id') == evaluation.get('task_id'):
current_reward = reward
break
if current_reward and 'extracted_answer' in current_reward:
f.write("\n" + "="*80 + "\n")
f.write("EXTRACTED ANSWER:\n")
f.write("="*80 + "\n")
f.write(current_reward['extracted_answer'])
f.write("\n" + "="*80 + "\n")
f.write("MATCH RESULT:\n")
f.write("="*80 + "\n")
match_result = "โœ… CORRECT" if current_reward.get('basic_accuracy', 0) > 0 else "โŒ INCORRECT"
f.write(f"{match_result} (Score: {current_reward.get('basic_accuracy', 0):.3f})")
response_count += 1
print(f"๐Ÿ“ LLM ์‘๋‹ต ์ €์žฅ: {response_dir}/ ({response_count}๊ฐœ ํŒŒ์ผ)")
# 4.5. ์ž…๋ ฅ ์ƒ์„ฑ ์ƒ์„ธ ์ •๋ณด ์ €์žฅ
save_input_generation_details(result, base_dir, timestamp)
# 5. ์ „์ฒด ๊ฒฐ๊ณผ ์š”์•ฝ ์ €์žฅ
summary_file = os.path.join(base_dir, f"{problem_id_safe}_summary.json")
with open(summary_file, 'w', encoding='utf-8') as f:
summary = {
'problem_id': result['problem_id'],
'benchmark': result['benchmark'],
'success': result['success'],
'timestamp': timestamp,
'initial_solution_correct': False,
'ipo_extraction_success': False,
'reasoning_task_results': {}
}
# Initial solution result
if 'llm_generation' in result['steps']:
llm_step = result['steps']['llm_generation']
eval_result = llm_step.get('solution_evaluation')
if eval_result:
summary['initial_solution_correct'] = eval_result['correct']
# IPO extraction result
if 'ipo_extraction' in result['steps']:
ipo_step = result['steps']['ipo_extraction']
summary['ipo_extraction_success'] = ipo_step.get('success', False)
# Reasoning task results
if 'reward_computation' in result['steps']:
reward_step = result['steps']['reward_computation']
rewards = reward_step.get('rewards', {})
for task_type, type_rewards in rewards.get('rewards_by_type', {}).items():
correct_count = sum(1 for r in type_rewards if r['basic_accuracy'] > 0)
total_count = len(type_rewards)
summary['reasoning_task_results'][task_type] = {
'correct': correct_count,
'total': total_count,
'accuracy': correct_count / total_count if total_count > 0 else 0
}
json.dump(summary, f, indent=2, ensure_ascii=False)
def run_batch_evaluation(args):
"""๋ฒค์น˜๋งˆํฌ ์ „์ฒด์— ๋Œ€ํ•œ ๋ฐฐ์น˜ ํ‰๊ฐ€ ์‹คํ–‰"""
# ํƒ€์ž„์Šคํƒฌํ”„ ์ƒ์„ฑ
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# ๊ฒฐ๊ณผ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ
output_dir = os.path.join(args.output_dir, f"batch_evaluation_{timestamp}")
os.makedirs(output_dir, exist_ok=True)
# ๋กœ๊ฑฐ ์„ค์ •
logger = TestTimeLogger(log_level='INFO')
logger.log_info(f"๐Ÿš€ Starting batch TestTime RLVR evaluation")
logger.log_info(f"๐Ÿ“‹ Model: {args.model}")
logger.log_info(f"๐ŸŽฏ Benchmark: {args.benchmark}")
logger.log_info(f"๐Ÿ“Š Max problems: {args.max_problems}")
logger.log_info(f"๐Ÿ“ Output: {output_dir}")
# TestTime configuration
config = TestTimeConfig(
model_name=args.model,
max_adaptation_steps=3,
learning_rate=1e-5,
task_distribution={'induction': 0.4, 'deduction': 0.3, 'abduction': 0.3},
adaptation_batch_size=1,
max_tasks_per_type=3,
use_flash_attention=False,
torch_dtype='float16', # use float16 for vLLM compatibility
enable_gradient_checkpointing=False
)
# Benchmark configuration (resolved to absolute paths)
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if args.benchmark == 'humaneval':
benchmark_config = BenchmarkConfig.get_humaneval_config()
benchmark_config.data_path = os.path.join(base_dir, 'evaluation/code_eval/data/HumanEvalPlus.jsonl')
elif args.benchmark == 'mbpp':
benchmark_config = BenchmarkConfig.get_mbpp_config()
benchmark_config.data_path = os.path.join(base_dir, 'evaluation/code_eval/data/MbppPlus.jsonl')
else:
raise ValueError(f"Unsupported benchmark: {args.benchmark}")
# Load the model and tokenizer
logger.log_info("📦 Loading model and tokenizer...")
try:
model, tokenizer = InitialSolutionGenerator.load_model_with_optimizations(
args.model, f'cuda:{args.gpu}', config, use_vllm=True
)
logger.log_info("✅ Model loaded successfully")
except Exception as e:
logger.log_error(f"❌ Failed to load model: {e}")
return False
# Initialize the pipeline
pipeline = CompleteTestTimePipeline(model, tokenizer, config, logger)
# Load the problem list
logger.log_info("📄 Loading benchmark problems...")
problems = load_benchmark_problems(benchmark_config)
if not problems:
logger.log_error("❌ No problems found in benchmark")
return False
# Handle the resume feature
original_problem_count = len(problems)
completed_problems = set()
existing_results = None
if args.resume or args.start_from:
# Load existing results
completed_problems = get_completed_problems(output_dir)
if completed_problems:
logger.log_info(f"🔄 Resume mode: Found {len(completed_problems)} completed problems")
# Load existing results
existing_results_file = os.path.join(output_dir, "batch_evaluation_results.json")
if os.path.exists(existing_results_file):
with open(existing_results_file, 'r', encoding='utf-8') as f:
existing_results = json.load(f)
logger.log_info(f"📁 Loaded existing results from {existing_results_file}")
# Exclude already completed problems
problems = [p for p in problems if p not in completed_problems]
logger.log_info(f"📊 After excluding completed: {len(problems)} problems remaining")
# Start from a specific problem
if args.start_from:
try:
start_idx = problems.index(args.start_from)
problems = problems[start_idx:]
logger.log_info(f"🏁 Starting from problem: {args.start_from} (index {start_idx})")
except ValueError:
logger.log_warning(f"⚠️ Problem {args.start_from} not found, starting from beginning")
# Limit the number of problems (applied to the remaining ones only)
if args.max_problems > 0:
problems = problems[:args.max_problems]
if not problems:
logger.log_info("🎉 All problems already completed!")
return True
logger.log_info(f"📊 Processing {len(problems)} problems (Total in benchmark: {original_problem_count})")
# Collect evaluation results (existing or new)
if existing_results:
# Start from the existing results (keep the statistics, initialize fields for the new problems)
results = {
'config': existing_results['config'].copy(),
'initial_solution_stats': {
**existing_results['initial_solution_stats'].copy(),
'first_attempt_correct': existing_results['initial_solution_stats'].get('first_attempt_correct', 0),
'at_least_once_correct': existing_results['initial_solution_stats'].get('at_least_once_correct', 0),
'total_attempts': existing_results['initial_solution_stats'].get('total_attempts', 0),
'total_successes': existing_results['initial_solution_stats'].get('total_successes', 0),
'first_attempt_failed_problem_ids': existing_results['initial_solution_stats'].get('first_attempt_failed_problem_ids', []),
'never_success_problem_ids': existing_results['initial_solution_stats'].get('never_success_problem_ids', [])
},
'reasoning_task_stats': {
task_type: {
**stats,
'total_accuracy': stats.get('total_accuracy', 0.0) # default when missing from existing results
}
for task_type, stats in existing_results['reasoning_task_stats'].items()
},
'ipo_extraction_stats': existing_results['ipo_extraction_stats'].copy(),
'input_generation_stats': existing_results.get('input_generation_stats', {
'total_attempts': 0,
'successful': 0,
'failed': 0,
'total_generated_inputs': 0,
'average_inputs_per_problem': 0.0,
'problems_with_generation': []
}).copy(),
'current_evaluation_stats': existing_results.get('current_evaluation_stats', existing_results.get('baseline_evaluation_stats', {
'total_attempts': 0,
'successful': 0,
'failed': 0,
'total_rounds': 0,
'total_success_rounds': 0,
'average_success_rate': 0.0,
'failed_problem_ids': []
})).copy(),
'diverse_programs_stats': existing_results.get('diverse_programs_stats', {
'total_attempts': 0,
'successful': 0,
'failed': 0,
'total_programs_generated': 0,
'total_valid_programs': 0,
'total_ipo_triples': 0,
'average_programs_per_problem': 0.0,
'average_ipo_per_problem': 0.0,
'failed_problem_ids': []
}).copy(),
'timing_stats': existing_results['timing_stats'].copy(),
'problem_results': existing_results['problem_results'].copy()
}
results['config']['resumed'] = True
results['config']['resumed_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
results['config']['remaining_problems'] = len(problems)
else:
# Fresh results structure
results = {
'config': {
'model': args.model,
'benchmark': args.benchmark,
'timestamp': timestamp,
'total_problems': original_problem_count,
'processing_problems': len(problems)
},
'initial_solution_stats': {
'total': 0,
'first_attempt_correct': 0, # correct on the first attempt only
'at_least_once_correct': 0, # correct on at least one of the 5 attempts
'total_attempts': 0, # total number of attempts
'total_successes': 0, # total number of successes
'first_attempt_failed_problem_ids': [], # problems whose first attempt failed
'never_success_problem_ids': [], # problems that failed all 5 attempts
'syntax_errors': 0,
'evaluation_errors': 0,
'correct': 0, # kept for backwards compatibility (same as at_least_once_correct)
'failed_problem_ids': [] # kept for backwards compatibility
},
'reasoning_task_stats': {
'induction': {
'total': 0,
'correct': 0,
'accuracy_0_count': 0, # number of problems with accuracy = 0
'accuracy_1_count': 0, # number of problems with accuracy = 1
'total_accuracy': 0.0 # accumulator for the overall average accuracy
},
'deduction': {
'total': 0,
'correct': 0,
'accuracy_0_count': 0,
'accuracy_1_count': 0,
'total_accuracy': 0.0
},
'abduction': {
'total': 0,
'correct': 0,
'accuracy_0_count': 0,
'accuracy_1_count': 0,
'total_accuracy': 0.0
}
},
'timing_stats': {
'total_time_seconds': 0,
'average_time_per_problem': 0,
'problem_times': [] # per-problem elapsed time
},
'ipo_extraction_stats': {
'total_attempts': 0,
'successful': 0,
'failed': 0,
'failed_problem_ids': [] # IDs of problems where IPO extraction failed
},
'input_generation_stats': {
'total_attempts': 0,
'successful': 0,
'failed': 0,
'total_generated_inputs': 0,
'average_inputs_per_problem': 0.0,
'problems_with_generation': []
},
'current_evaluation_stats': {
'total_attempts': 0,
'successful': 0,
'failed': 0,
'total_rounds': 0,
'total_success_rounds': 0,
'average_success_rate': 0.0,
'failed_problem_ids': []
},
'diverse_programs_stats': {
'total_attempts': 0,
'successful': 0,
'failed': 0,
'total_programs_generated': 0,
'total_valid_programs': 0,
'total_ipo_triples': 0,
'average_programs_per_problem': 0.0,
'average_ipo_per_problem': 0.0,
'failed_problem_ids': []
},
'problem_results': []
}
# Run the pipeline for each problem
start_total_time = time.time()
for i, problem_id in enumerate(problems):
logger.log_info(f"🔄 [{i+1}/{len(problems)}] Processing {problem_id}")
# Start timing this problem
problem_start_time = time.time()
# Track success/failure per step
step_results = {
'problem_loading': False,
'llm_generation': False,
'solution_evaluation': False,
'ipo_extraction': False,
'input_generation': False, # newly added step
'task_generation': False,
'task_evaluation': False
}
try:
# Run the pipeline
result = pipeline.run_complete_pipeline(benchmark_config, problem_id)
# Stop timing this problem
problem_end_time = time.time()
problem_duration = problem_end_time - problem_start_time
# Check whether each step succeeded
if 'steps' in result:
step_results['problem_loading'] = result.get('success', False)
# If baseline_evaluation is present, LLM generation and solution evaluation ran
if 'baseline_evaluation' in result['steps']:
baseline_eval = result['steps']['baseline_evaluation']
step_results['llm_generation'] = baseline_eval.get('success', False)
step_results['solution_evaluation'] = baseline_eval.get('success_count', 0) > 0
# If diverse_programs is present, IPO extraction succeeded
if 'diverse_programs' in result['steps']:
diverse_progs = result['steps']['diverse_programs']
step_results['ipo_extraction'] = diverse_progs.get('total_ipo_triples', 0) > 0
# Input generation success: check whether diverse_programs contains generated inputs
if 'diverse_programs' in result['steps']:
diverse_progs = result['steps']['diverse_programs']
total_generated = sum(p.get('num_generated_inputs', 0) for p in diverse_progs.get('programs', []))
step_results['input_generation'] = total_generated > 0
# Task generation and evaluation success
if 'task_generation' in result['steps']:
task_gen = result['steps']['task_generation']
step_results['task_generation'] = task_gen.get('total_tasks', 0) > 0
if 'task_evaluation' in result['steps']:
task_eval = result['steps']['task_evaluation']
step_results['task_evaluation'] = task_eval.get('total_evaluated', 0) > 0
# Per-step logging
logger.log_info(f" 📋 Problem Loading: {'✅' if step_results['problem_loading'] else '❌'}")
logger.log_info(f" 🤖 LLM Generation: {'✅' if step_results['llm_generation'] else '❌'}")
logger.log_info(f" 📊 Solution Evaluation: {'✅' if step_results['solution_evaluation'] else '❌'}")
logger.log_info(f" 🔍 IPO Extraction: {'✅' if step_results['ipo_extraction'] else '❌'}")
logger.log_info(f" 🎲 Input Generation: {'✅' if step_results['input_generation'] else '❌'}")
logger.log_info(f" 📝 Task Generation: {'✅' if step_results['task_generation'] else '❌'}")
logger.log_info(f" 🧠 Task Evaluation: {'✅' if step_results['task_evaluation'] else '❌'}")
# In the new structure, saving initial_solution separately is unnecessary (replaced by current_evaluation)
# if step_results['llm_generation']:
# try:
# save_initial_solution_only(result, output_dir, timestamp, problem_id)
# logger.log_info(f" 📁 Initial solution saved for {problem_id}")
# except Exception as e:
# logger.log_warning(f" ⚠️ Failed to save initial solution: {e}")
# Save complete results only when the whole pipeline succeeded
if result['success']:
try:
save_detailed_results(result, output_dir, timestamp)
# Save the new current evaluation and diverse program results
base_dir = os.path.join(output_dir, result.get('benchmark', 'unknown'), problem_id.replace('/', '_'))
save_current_evaluation_details(result, base_dir, timestamp)
save_diverse_programs_details(result, base_dir, timestamp)
logger.log_info(f" 📁 Complete results saved for {problem_id}")
except Exception as e:
logger.log_warning(f" ⚠️ Failed to save complete results: {e}")
# Update initial solution statistics
results['initial_solution_stats']['total'] += 1
initial_solution_correct = False
# Update IPO extraction statistics
results['ipo_extraction_stats']['total_attempts'] += 1
if result['success']:
# Compute statistics from the baseline_evaluation results (5 attempts)
baseline_eval = result['steps'].get('baseline_evaluation', {})
attempts = baseline_eval.get('solutions', [])
if attempts:
# Accumulate total attempts and successes
results['initial_solution_stats']['total_attempts'] += len(attempts)
successes = sum(1 for attempt in attempts if attempt.get('evaluation', {}).get('correct', False))
results['initial_solution_stats']['total_successes'] += successes
# 1. First-attempt accuracy
first_attempt_correct = attempts[0].get('evaluation', {}).get('correct', False)
if first_attempt_correct:
results['initial_solution_stats']['first_attempt_correct'] += 1
else:
# Record problems whose first attempt failed
if problem_id not in results['initial_solution_stats']['first_attempt_failed_problem_ids']:
results['initial_solution_stats']['first_attempt_failed_problem_ids'].append(problem_id)
# 2. Success on at least one of the 5 attempts
at_least_once_success = any(attempt.get('evaluation', {}).get('correct', False) for attempt in attempts)
if at_least_once_success:
results['initial_solution_stats']['at_least_once_correct'] += 1
results['initial_solution_stats']['correct'] += 1 # backwards compatibility
initial_solution_correct = True
else:
# Record problems that failed all 5 attempts
if problem_id not in results['initial_solution_stats']['never_success_problem_ids']:
results['initial_solution_stats']['never_success_problem_ids'].append(problem_id)
if problem_id not in results['initial_solution_stats']['failed_problem_ids']:
results['initial_solution_stats']['failed_problem_ids'].append(problem_id)
# Check syntax and evaluation errors (based on the first attempt)
first_attempt = attempts[0]
if not first_attempt.get('syntax_valid', True):
results['initial_solution_stats']['syntax_errors'] += 1
if first_attempt.get('evaluation_error'):
results['initial_solution_stats']['evaluation_errors'] += 1
else:
# Fall back to the original approach when baseline_evaluation is absent
llm_gen = result['steps'].get('llm_generation', {})
eval_result = llm_gen.get('solution_evaluation')
if eval_result:
if eval_result['correct']:
results['initial_solution_stats']['first_attempt_correct'] += 1
results['initial_solution_stats']['at_least_once_correct'] += 1
results['initial_solution_stats']['correct'] += 1
initial_solution_correct = True
else:
# Record the failed problem ID
if problem_id not in results['initial_solution_stats']['first_attempt_failed_problem_ids']:
results['initial_solution_stats']['first_attempt_failed_problem_ids'].append(problem_id)
if problem_id not in results['initial_solution_stats']['never_success_problem_ids']:
results['initial_solution_stats']['never_success_problem_ids'].append(problem_id)
if problem_id not in results['initial_solution_stats']['failed_problem_ids']:
results['initial_solution_stats']['failed_problem_ids'].append(problem_id)
if eval_result.get('error'):
results['initial_solution_stats']['evaluation_errors'] += 1
if not llm_gen.get('syntax_valid', True):
results['initial_solution_stats']['syntax_errors'] += 1
# Check whether IPO extraction succeeded
ipo_step = result['steps'].get('ipo_extraction', {})
if ipo_step.get('success', False) and ipo_step.get('triples'):
results['ipo_extraction_stats']['successful'] += 1
else:
results['ipo_extraction_stats']['failed'] += 1
if problem_id not in results['ipo_extraction_stats']['failed_problem_ids']:
results['ipo_extraction_stats']['failed_problem_ids'].append(problem_id)
logger.log_info(f" ⚠️ IPO extraction failed for {problem_id}")
# Update input generation statistics
if ipo_step.get('success', False):
results['input_generation_stats']['total_attempts'] += 1
if ipo_step.get('num_generated', 0) > 0:
results['input_generation_stats']['successful'] += 1
results['input_generation_stats']['total_generated_inputs'] += ipo_step['num_generated']
if problem_id not in results['input_generation_stats']['problems_with_generation']:
results['input_generation_stats']['problems_with_generation'].append(problem_id)
else:
results['input_generation_stats']['failed'] += 1
# Update current evaluation statistics
baseline_step = result['steps'].get('baseline_evaluation', {})
if baseline_step:
results['current_evaluation_stats']['total_attempts'] += 1
if baseline_step.get('success', False):
results['current_evaluation_stats']['successful'] += 1
results['current_evaluation_stats']['total_rounds'] += baseline_step.get('total_rounds', 0)
results['current_evaluation_stats']['total_success_rounds'] += baseline_step.get('success_count', 0)
else:
results['current_evaluation_stats']['failed'] += 1
if problem_id not in results['current_evaluation_stats']['failed_problem_ids']:
results['current_evaluation_stats']['failed_problem_ids'].append(problem_id)
# Update diverse programs statistics
diverse_step = result['steps'].get('diverse_programs', {})
if diverse_step:
results['diverse_programs_stats']['total_attempts'] += 1
if diverse_step.get('success', False):
results['diverse_programs_stats']['successful'] += 1
results['diverse_programs_stats']['total_programs_generated'] += diverse_step.get('total_programs', 0)
results['diverse_programs_stats']['total_valid_programs'] += diverse_step.get('valid_programs', 0)
results['diverse_programs_stats']['total_ipo_triples'] += diverse_step.get('total_ipo_triples', 0)
else:
results['diverse_programs_stats']['failed'] += 1
if problem_id not in results['diverse_programs_stats']['failed_problem_ids']:
results['diverse_programs_stats']['failed_problem_ids'].append(problem_id)
# Update reasoning task statistics (based on per-problem average accuracy)
reward_step = result['steps'].get('reward_computation', {})
rewards = reward_step.get('rewards', {})
# Compute the per-problem average accuracy for each task type
for task_type, type_rewards in rewards.get('rewards_by_type', {}).items():
if type_rewards: # only when this task type has tasks
results['reasoning_task_stats'][task_type]['total'] += 1
# Average accuracy of this task type on this problem
task_accuracies = [reward['basic_accuracy'] for reward in type_rewards]
problem_avg_accuracy = sum(task_accuracies) / len(task_accuracies)
# Accumulate into the overall average accuracy
results['reasoning_task_stats'][task_type]['total_accuracy'] += problem_avg_accuracy
# Count as correct when the per-problem average is above 0
if problem_avg_accuracy > 0:
results['reasoning_task_stats'][task_type]['correct'] += 1
# Track the distribution of per-problem average accuracy
if problem_avg_accuracy == 0.0:
results['reasoning_task_stats'][task_type]['accuracy_0_count'] += 1
elif problem_avg_accuracy == 1.0:
results['reasoning_task_stats'][task_type]['accuracy_1_count'] += 1
# partial accuracy (0 < acc < 1) follows from the remaining cases
# Save the per-problem result (including timing)
problem_result = {
'problem_id': problem_id,
'success': result['success'],
'error': result.get('error'),
'step_results': step_results,
'initial_solution_correct': initial_solution_correct,
'reasoning_tasks_correct': {},
'time_seconds': problem_duration
}
if result['success']:
# Reasoning task results (with detailed accuracy info)
reward_step = result['steps'].get('reward_computation', {})
rewards = reward_step.get('rewards', {})
for task_type, type_rewards in rewards.get('rewards_by_type', {}).items():
correct_count = sum(1 for r in type_rewards if r['basic_accuracy'] > 0)
total_count = len(type_rewards)
accuracy_0_count = sum(1 for r in type_rewards if r['basic_accuracy'] == 0)
accuracy_1_count = sum(1 for r in type_rewards if r['basic_accuracy'] == 1)
# Average accuracy on this problem
problem_average = sum(r['basic_accuracy'] for r in type_rewards) / len(type_rewards) if type_rewards else 0.0
problem_result['reasoning_tasks_correct'][task_type] = {
'correct_count': correct_count,
'total_count': total_count,
'accuracy_0_count': accuracy_0_count,
'accuracy_1_count': accuracy_1_count,
'problem_average_accuracy': problem_average,
'summary': f"{correct_count}/{total_count} (avg: {problem_average:.3f})"
}
# Add timing info
results['timing_stats']['problem_times'].append({
'problem_id': problem_id,
'time_seconds': problem_duration,
'time_formatted': f"{problem_duration:.2f}s"
})
results['problem_results'].append(problem_result)
# Progress logging
if result['success']:
logger.log_info(f" ✅ Success - Initial: {'✅' if problem_result['initial_solution_correct'] else '❌'}")
else:
logger.log_error(f" ❌ Failed: {result.get('error', 'Unknown error')}")
except Exception as e:
# Measure the elapsed time even when an exception occurs
problem_end_time = time.time()
problem_duration = problem_end_time - problem_start_time
logger.log_error(f" 💥 Exception during pipeline execution: {e}")
logger.log_error(f" 📋 Problem Loading: ❌ (Exception)")
logger.log_error(f" 🤖 LLM Generation: ❌ (Exception)")
logger.log_error(f" 📊 Solution Evaluation: ❌ (Exception)")
logger.log_error(f" 🔍 IPO Extraction: ❌ (Exception)")
logger.log_error(f" 📝 Task Generation: ❌ (Exception)")
logger.log_error(f" 🧠 Task Evaluation: ❌ (Exception)")
# Update statistics on exception
results['initial_solution_stats']['total'] += 1
# On exception, add the problem to every failure list
if problem_id not in results['initial_solution_stats']['first_attempt_failed_problem_ids']:
results['initial_solution_stats']['first_attempt_failed_problem_ids'].append(problem_id)
if problem_id not in results['initial_solution_stats']['never_success_problem_ids']:
results['initial_solution_stats']['never_success_problem_ids'].append(problem_id)
if problem_id not in results['initial_solution_stats']['failed_problem_ids']:
results['initial_solution_stats']['failed_problem_ids'].append(problem_id)
results['ipo_extraction_stats']['total_attempts'] += 1
results['ipo_extraction_stats']['failed'] += 1
if problem_id not in results['ipo_extraction_stats']['failed_problem_ids']:
results['ipo_extraction_stats']['failed_problem_ids'].append(problem_id)
# Record the problem result even on exception (with per-step info)
results['problem_results'].append({
'problem_id': problem_id,
'success': False,
'error': str(e),
'step_results': {
'problem_loading': False,
'llm_generation': False,
'solution_evaluation': False,
'ipo_extraction': False,
'input_generation': False,
'task_generation': False,
'task_evaluation': False
},
'initial_solution_correct': False,
'reasoning_tasks_correct': {},
'time_seconds': problem_duration
})
# Add timing info
results['timing_stats']['problem_times'].append({
'problem_id': problem_id,
'time_seconds': problem_duration,
'time_formatted': f"{problem_duration:.2f}s"
})
# Compute total execution time
end_total_time = time.time()
total_duration = end_total_time - start_total_time
# Update timing statistics
results['timing_stats']['total_time_seconds'] = total_duration
if len(problems) > 0:
results['timing_stats']['average_time_per_problem'] = total_duration / len(problems)
# Compute final statistics
logger.log_info("📊 Computing final statistics...")
# Compute the input generation average
input_stats = results['input_generation_stats']
if input_stats['successful'] > 0:
input_stats['average_inputs_per_problem'] = input_stats['total_generated_inputs'] / input_stats['successful']
# Compute the current evaluation average
current_stats = results['current_evaluation_stats']
if current_stats['total_rounds'] > 0:
current_stats['average_success_rate'] = current_stats['total_success_rounds'] / current_stats['total_rounds']
# Compute diverse program averages
diverse_stats = results['diverse_programs_stats']
if diverse_stats['successful'] > 0:
diverse_stats['average_programs_per_problem'] = diverse_stats['total_programs_generated'] / diverse_stats['successful']
diverse_stats['average_ipo_per_problem'] = diverse_stats['total_ipo_triples'] / diverse_stats['successful']
# Display timing statistics
logger.log_info(f"⏱️ Total execution time: {total_duration:.2f}s ({total_duration/60:.1f}min)")
logger.log_info(f"⏱️ Average time per problem: {results['timing_stats']['average_time_per_problem']:.2f}s")
# Initial solution accuracy (three metrics)
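# With N problems and 5 solution attempts each:
#   1) first-attempt accuracy = (# problems whose first attempt passed) / N
#   2) at-least-once rate     = (# problems with >= 1 passing attempt) / N
#   3) average success rate   = (total passing attempts) / (5 * N)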
initial_stats = results['initial_solution_stats']
if initial_stats['total'] > 0:
# 1. First-attempt accuracy
first_attempt_accuracy = initial_stats['first_attempt_correct'] / initial_stats['total']
logger.log_info(f"📈 First Attempt Accuracy: {first_attempt_accuracy:.3f} ({initial_stats['first_attempt_correct']}/{initial_stats['total']})")
# 2. At-least-once success rate (any of the 5 attempts)
at_least_once_accuracy = initial_stats['at_least_once_correct'] / initial_stats['total']
logger.log_info(f"📈 At-Least-Once Success Rate: {at_least_once_accuracy:.3f} ({initial_stats['at_least_once_correct']}/{initial_stats['total']})")
# 3. Average accuracy over the 5 attempts
if initial_stats['total_attempts'] > 0:
average_accuracy = initial_stats['total_successes'] / initial_stats['total_attempts']
logger.log_info(f"📈 Average Success Rate (5 attempts): {average_accuracy:.3f} ({initial_stats['total_successes']}/{initial_stats['total_attempts']})")
logger.log_info(f"📈 First attempt failed problems: {len(initial_stats['first_attempt_failed_problem_ids'])}/{initial_stats['total']}")
logger.log_info(f"📈 Never success problems: {len(initial_stats['never_success_problem_ids'])}/{initial_stats['total']}")
# IPO extraction statistics
ipo_stats = results['ipo_extraction_stats']
if ipo_stats['total_attempts'] > 0:
ipo_success_rate = ipo_stats['successful'] / ipo_stats['total_attempts']
logger.log_info(f"๐Ÿ”— IPO Extraction Success Rate: {ipo_success_rate:.3f} ({ipo_stats['successful']}/{ipo_stats['total_attempts']})")
logger.log_info(f"๐Ÿ”— IPO Extraction Failed: {ipo_stats['failed']} problems")
# Input generation statistics
if input_stats['total_attempts'] > 0:
input_success_rate = input_stats['successful'] / input_stats['total_attempts']
logger.log_info(f"๐ŸŽฒ Input Generation Success Rate: {input_success_rate:.3f} ({input_stats['successful']}/{input_stats['total_attempts']})")
logger.log_info(f"๐ŸŽฒ Total Generated Inputs: {input_stats['total_generated_inputs']}")
logger.log_info(f"๐ŸŽฒ Average Inputs per Problem: {input_stats['average_inputs_per_problem']:.2f}")
# Current evaluation statistics
if current_stats['total_attempts'] > 0:
current_success_rate = current_stats['successful'] / current_stats['total_attempts']
logger.log_info(f"๐Ÿ“Š Current Evaluation Success Rate: {current_success_rate:.3f} ({current_stats['successful']}/{current_stats['total_attempts']})")
logger.log_info(f"๐Ÿ“Š Total Current Rounds: {current_stats['total_rounds']}")
logger.log_info(f"๐Ÿ“Š Average Success Rate: {current_stats['average_success_rate']:.3f}")
# Diverse programs statistics
if diverse_stats['total_attempts'] > 0:
diverse_success_rate = diverse_stats['successful'] / diverse_stats['total_attempts']
logger.log_info(f"๐ŸŽจ Diverse Programs Success Rate: {diverse_success_rate:.3f} ({diverse_stats['successful']}/{diverse_stats['total_attempts']})")
logger.log_info(f"๐ŸŽจ Total Programs Generated: {diverse_stats['total_programs_generated']}")
logger.log_info(f"๐ŸŽจ Total Valid Programs: {diverse_stats['total_valid_programs']}")
logger.log_info(f"๐ŸŽจ Total IPO Triples: {diverse_stats['total_ipo_triples']}")
logger.log_info(f"๐ŸŽจ Average Programs per Problem: {diverse_stats['average_programs_per_problem']:.2f}")
logger.log_info(f"๐ŸŽจ Average IPO per Problem: {diverse_stats['average_ipo_per_problem']:.2f}")
# Reasoning task accuracy (with detailed breakdown)
for task_type, stats in results['reasoning_task_stats'].items():
if stats['total'] > 0:
task_accuracy = stats['correct'] / stats['total']
logger.log_info(f"๐Ÿ“ˆ {task_type.title()} Task Accuracy: {task_accuracy:.3f} ({stats['correct']}/{stats['total']})")
logger.log_info(f" - Accuracy=0: {stats['accuracy_0_count']}, Accuracy=1: {stats['accuracy_1_count']}")
# Save the aggregated results as JSON
result_file = os.path.join(output_dir, "batch_evaluation_results.json")
with open(result_file, 'w', encoding='utf-8') as f:
json.dump(results, f, indent=2, ensure_ascii=False)
# Generate the summary report (with extended statistics)
summary_file = os.path.join(output_dir, "evaluation_summary.md")
with open(summary_file, 'w', encoding='utf-8') as f:
f.write(f"# TestTime RLVR Batch Evaluation Report\n\n")
f.write(f"**Model**: {args.model}\n")
f.write(f"**Benchmark**: {args.benchmark}\n")
f.write(f"**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"**Total Problems**: {results['initial_solution_stats']['total']}\n")
f.write(f"**Output Directory**: `{output_dir}`\n\n")
f.write(f"## Directory Structure\n")
f.write(f"```\n")
f.write(f"{output_dir}/\n")
f.write(f"โ”œโ”€โ”€ batch_evaluation_results.json # ์ „์ฒด ํ†ต๊ณ„ ๊ฒฐ๊ณผ\n")
f.write(f"โ”œโ”€โ”€ evaluation_summary.md # ์ด ์š”์•ฝ ํŒŒ์ผ\n")
f.write(f"โ””โ”€โ”€ {args.benchmark}/ # ๋ฒค์น˜๋งˆํฌ๋ณ„ ์ƒ์„ธ ๊ฒฐ๊ณผ\n")
f.write(f" โ””โ”€โ”€ [problem_id]/ # ๊ฐ ๋ฌธ์ œ๋ณ„ ๋””๋ ‰ํ† ๋ฆฌ\n")
f.write(f" โ”œโ”€โ”€ initial_solution/ # ์ดˆ๊ธฐ LLM ์†”๋ฃจ์…˜\n")
f.write(f" โ”œโ”€โ”€ ipo_triples/ # IPO ํŠธ๋ฆฌํ”Œ\n")
f.write(f" โ”œโ”€โ”€ task_prompts/ # ์ƒ์„ฑ๋œ ํƒœ์Šคํฌ\n")
f.write(f" โ”œโ”€โ”€ llm_responses/ # LLM ์‘๋‹ต\n")
f.write(f" โ””โ”€โ”€ [problem_id]_summary.json # ๋ฌธ์ œ๋ณ„ ์š”์•ฝ\n")
f.write(f"```\n\n")
# Timing statistics section
f.write(f"## Timing Statistics\n")
f.write(f"- **Total Execution Time**: {total_duration:.2f}s ({total_duration/60:.1f} minutes)\n")
f.write(f"- **Average Time per Problem**: {results['timing_stats']['average_time_per_problem']:.2f}s\n")
fastest = min(results['timing_stats']['problem_times'], key=lambda x: x['time_seconds'])
slowest = max(results['timing_stats']['problem_times'], key=lambda x: x['time_seconds'])
f.write(f"- **Fastest Problem**: {fastest['time_formatted']} ({fastest['problem_id']})\n")
f.write(f"- **Slowest Problem**: {slowest['time_formatted']} ({slowest['problem_id']})\n\n")
f.write(f"## Current Evaluation Performance (5 attempts per problem)\n\n")
# 1. First-attempt accuracy
first_attempt_accuracy = initial_stats['first_attempt_correct'] / initial_stats['total'] if initial_stats['total'] > 0 else 0
f.write(f"### 1. First Attempt Accuracy\n")
f.write(f"- **Accuracy**: {first_attempt_accuracy:.3f} ({initial_stats['first_attempt_correct']}/{initial_stats['total']})\n")
f.write(f"- **Description**: Success rate based on first attempt only\n\n")
# 2. At-least-once success (any of the 5 attempts)
at_least_once_accuracy = initial_stats['at_least_once_correct'] / initial_stats['total'] if initial_stats['total'] > 0 else 0
f.write(f"### 2. At-Least-Once Success Rate\n")
f.write(f"- **Accuracy**: {at_least_once_accuracy:.3f} ({initial_stats['at_least_once_correct']}/{initial_stats['total']})\n")
f.write(f"- **Description**: Problems where at least 1 out of 5 attempts succeeded\n\n")
# 3. Average accuracy over the 5 attempts
if initial_stats['total_attempts'] > 0:
average_accuracy = initial_stats['total_successes'] / initial_stats['total_attempts']
f.write(f"### 3. Average Success Rate (5 attempts)\n")
f.write(f"- **Accuracy**: {average_accuracy:.3f}\n")
f.write(f"- **Description**: Average of individual problem success rates across 5 attempts\n")
f.write(f"- **Total Evaluations**: {initial_stats['total_attempts']} ({initial_stats['total']} ร— 5)\n")
f.write(f"- **Total Successes**: {initial_stats['total_successes']}\n\n")
# Additional statistics
f.write(f"### Additional Statistics\n")
f.write(f"- **Syntax Errors**: {initial_stats['syntax_errors']}\n")
f.write(f"- **Evaluation Errors**: {initial_stats['evaluation_errors']}\n\n")
# Per-step pipeline success statistics
f.write(f"## Pipeline Step Success Statistics\n")
# ๊ฐ ๋‹จ๊ณ„๋ณ„ ์„ฑ๊ณต ๊ฐœ์ˆ˜ ๊ณ„์‚ฐ
step_stats = {
'problem_loading': 0,
'llm_generation': 0,
'solution_evaluation': 0,
'ipo_extraction': 0,
'input_generation': 0,
'task_generation': 0,
'task_evaluation': 0
}
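# These keys mirror the per-problem 'step_results' dicts aggregated in the loop below.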
for problem_result in results['problem_results']:
if 'step_results' in problem_result:
for step, success in problem_result['step_results'].items():
if success:
step_stats[step] += 1
total_problems = results['initial_solution_stats']['total']
f.write(f"- **Problem Loading**: {step_stats['problem_loading']}/{total_problems} ({step_stats['problem_loading']/total_problems*100:.1f}%)\n")
f.write(f"- **LLM Generation**: {step_stats['llm_generation']}/{total_problems} ({step_stats['llm_generation']/total_problems*100:.1f}%)\n")
f.write(f"- **Solution Evaluation**: {step_stats['solution_evaluation']}/{total_problems} ({step_stats['solution_evaluation']/total_problems*100:.1f}%)\n")
f.write(f"- **IPO Extraction**: {step_stats['ipo_extraction']}/{total_problems} ({step_stats['ipo_extraction']/total_problems*100:.1f}%)\n")
f.write(f"- **Input Generation**: {step_stats['input_generation']}/{total_problems} ({step_stats['input_generation']/total_problems*100:.1f}%)\n")
f.write(f"- **Task Generation**: {step_stats['task_generation']}/{total_problems} ({step_stats['task_generation']/total_problems*100:.1f}%)\n")
f.write(f"- **Task Evaluation**: {step_stats['task_evaluation']}/{total_problems} ({step_stats['task_evaluation']/total_problems*100:.1f}%)\n\n")
# IPO extraction statistics section
ipo_stats = results['ipo_extraction_stats']
if ipo_stats['total_attempts'] > 0:
ipo_success_rate = ipo_stats['successful'] / ipo_stats['total_attempts']
f.write(f"## IPO Extraction Performance\n")
f.write(f"- **Total Attempts**: {ipo_stats['total_attempts']}\n")
f.write(f"- **Successful**: {ipo_stats['successful']}\n")
f.write(f"- **Failed**: {ipo_stats['failed']}\n")
f.write(f"- **Success Rate**: {ipo_success_rate:.3f}\n\n")
# List the problem IDs where IPO extraction failed
if ipo_stats['failed_problem_ids']:
f.write(f"### IPO Extraction Failed Problem IDs\n")
for problem_id in ipo_stats['failed_problem_ids']:
f.write(f"- `{problem_id}`\n")
f.write(f"\n")
# Input generation statistics section
input_gen_stats = results.get('input_generation_stats', {})
if input_gen_stats and input_gen_stats['total_attempts'] > 0:
gen_success_rate = input_gen_stats['successful'] / input_gen_stats['total_attempts']
f.write(f"## Input Generation Performance\n")
f.write(f"- **Total Attempts**: {input_gen_stats['total_attempts']}\n")
f.write(f"- **Successful**: {input_gen_stats['successful']}\n")
f.write(f"- **Failed**: {input_gen_stats['failed']}\n")
f.write(f"- **Success Rate**: {gen_success_rate:.3f}\n")
f.write(f"- **Total Generated Inputs**: {input_gen_stats['total_generated_inputs']}\n")
f.write(f"- **Average Inputs per Problem**: {input_gen_stats['average_inputs_per_problem']:.2f}\n\n")
# List of problems where input generation was performed
if input_gen_stats.get('problems_with_generation'):
f.write(f"### Problems with Input Generation\n")
f.write(f"Total: {len(input_gen_stats['problems_with_generation'])} problems\n")
# ์ฒ˜์Œ 10๊ฐœ๋งŒ ํ‘œ์‹œ
for i, problem_id in enumerate(input_gen_stats['problems_with_generation'][:10]):
f.write(f"- `{problem_id}`\n")
if len(input_gen_stats['problems_with_generation']) > 10:
f.write(f"- ... and {len(input_gen_stats['problems_with_generation']) - 10} more\n")
f.write(f"\n")
# Problem ID classification section
f.write(f"## Problem Classification\n\n")
# Classification by first attempt
f.write(f"### 📈 First Attempt Results\n")
f.write(f"- **Success**: {initial_stats['first_attempt_correct']} problems\n")
f.write(f"- **Failure**: {len(initial_stats['first_attempt_failed_problem_ids'])} problems\n\n")
# Combined classification over the five attempts
f.write(f"### 📊 Five-Attempt Results\n")
f.write(f"- **At-Least-Once Success**: {initial_stats['at_least_once_correct']} problems\n")
f.write(f"- **Never Success**: {len(initial_stats['never_success_problem_ids'])} problems\n\n")
# Problem IDs that failed the first attempt
if initial_stats['first_attempt_failed_problem_ids']:
f.write(f"### First Attempt Failed Problem IDs\n")
for problem_id in initial_stats['first_attempt_failed_problem_ids']:
f.write(f"- `{problem_id}`\n")
f.write(f"\n")
# Problem IDs that failed all five attempts
if initial_stats['never_success_problem_ids']:
f.write(f"### Never Success Problem IDs (0/5)\n")
for problem_id in initial_stats['never_success_problem_ids']:
f.write(f"- `{problem_id}`\n")
f.write(f"\n")
f.write(f"## Reasoning Task Performance\n")
f.write(f"*Note: Statistics based on problem-level average accuracy for each task type*\n\n")
for task_type, stats in results['reasoning_task_stats'].items():
if stats['total'] > 0:
# Overall Success Rate = mean of the per-problem average accuracies
overall_accuracy = stats['total_accuracy'] / stats['total']
partial_count = stats['total'] - stats['accuracy_0_count'] - stats['accuracy_1_count']
f.write(f"### {task_type.title()} Tasks\n")
f.write(f"- **Total Problems**: {stats['total']} (problems that had {task_type} tasks)\n")
f.write(f"- **Problems with >0 Avg Accuracy**: {stats['correct']}\n")
f.write(f"- **Overall Success Rate**: {overall_accuracy:.3f}\n")
f.write(f"- **Problems with Avg Accuracy = 0.0**: {stats['accuracy_0_count']} problems\n")
f.write(f"- **Problems with Avg Accuracy = 1.0**: {stats['accuracy_1_count']} problems\n")
f.write(f"- **Problems with Partial Accuracy**: {partial_count} problems\n\n")
# ์ƒ์„ธํ•œ ๋ฌธ์ œ ๋ถ„๋ฅ˜ ์ถ”๊ฐ€
f.write(generate_detailed_classification(output_dir, args.benchmark))
f.write(f"## Files\n")
f.write(f"- **Detailed Results**: {result_file}\n")
f.write(f"- **Summary Report**: {summary_file}\n")
f.write(f"- **First Attempt Failed Problems**: See 'First Attempt Failed Problem IDs' section above\n")
f.write(f"- **Never Success Problems**: See 'Never Success Problem IDs' section above\n")
if ipo_stats['failed_problem_ids']:
f.write(f"- **IPO Extraction Failed Problems**: See 'IPO Extraction Failed Problem IDs' section above and ipo_extraction_failed_problems.txt\n")
# Save the IPO-extraction failed problem IDs to a separate file
if ipo_stats['failed_problem_ids']:
failed_ipo_file = os.path.join(output_dir, "ipo_extraction_failed_problems.txt")
with open(failed_ipo_file, 'w', encoding='utf-8') as f:
f.write(f"# IPO Extraction Failed Problems\n")
f.write(f"# Benchmark: {args.benchmark}\n")
f.write(f"# Model: {args.model}\n")
f.write(f"# Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"# Total Failed: {len(ipo_stats['failed_problem_ids'])}/{ipo_stats['total_attempts']}\n")
f.write(f"# Success Rate: {(ipo_stats['successful'] / ipo_stats['total_attempts']):.3f}\n")
f.write(f"#\n")
for problem_id in ipo_stats['failed_problem_ids']:
f.write(f"{problem_id}\n")
logger.log_info(f"๐Ÿ“„ IPO extraction failed problems saved: {failed_ipo_file}")
logger.log_info(f"โœ… Batch evaluation completed!")
logger.log_info(f"๐Ÿ“ Results saved to: {output_dir}")
logger.log_info(f" ๐Ÿ“„ Summary report: evaluation_summary.md")
logger.log_info(f" ๐Ÿ“Š Statistics JSON: batch_evaluation_results.json")
logger.log_info(f" ๐Ÿ“‚ Detailed results: {args.benchmark}/[problem_id]/")
logger.log_info(f" โ””โ”€โ”€ initial_solution/ # LLM ์†”๋ฃจ์…˜")
logger.log_info(f" โ””โ”€โ”€ ipo_triples/ # IPO ํŠธ๋ฆฌํ”Œ")
logger.log_info(f" โ””โ”€โ”€ task_prompts/ # ์ƒ์„ฑ๋œ ํƒœ์Šคํฌ")
logger.log_info(f" โ””โ”€โ”€ llm_responses/ # LLM ์‘๋‹ต")
if ipo_stats['failed_problem_ids']:
logger.log_info(f"๐Ÿ“„ IPO failed problems: {len(ipo_stats['failed_problem_ids'])} problems saved to ipo_extraction_failed_problems.txt")
# ๋ชจ๋ธ ์ •๋ฆฌ (VLLM ์˜ฌ๋ฐ”๋ฅธ ์ข…๋ฃŒ)
try:
import gc
import torch
# 1. Clean up the VLLM model (the proper way)
if hasattr(model, 'llm_engine'):
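# Note: the llm_engine / model_executor attribute layout differs across vLLM versions,
# hence the hasattr() guards around this cleanup path.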
# Shut down the LLMEngine's model_executor directly
if hasattr(model.llm_engine, 'model_executor'):
logger.log_info("🔄 Shutting down VLLM model executor...")
model.llm_engine.model_executor.shutdown()
# Explicitly release the engine reference
del model.llm_engine
# 2. Release the model object reference
del model
# 3. Free cached GPU memory
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
# 4. Force a garbage collection pass
gc.collect()
logger.log_info("🧹 Model cleanup completed properly")
except Exception as e:
logger.log_warning(f"โš ๏ธ Model cleanup failed: {e}")
# ๋ฐฑ์—…: ๊ฐ•์ œ ์ข…๋ฃŒ (๋ฌธ์ œ๊ฐ€ ์žˆ์„ ๊ฒฝ์šฐ์—๋งŒ)
logger.log_warning("๐Ÿšจ Attempting emergency cleanup...")
try:
import psutil
# Forcibly terminate VLLM-related child processes
current_pid = os.getpid()
parent = psutil.Process(current_pid)
for child in parent.children(recursive=True):
try:
child.terminate()
child.wait(timeout=2)
except (psutil.NoSuchProcess, psutil.TimeoutExpired):
try:
child.kill()
except psutil.NoSuchProcess:
pass
logger.log_warning("๐Ÿšจ Emergency cleanup completed")
except Exception as cleanup_error:
logger.log_error(f"๐Ÿ’ฅ Emergency cleanup also failed: {cleanup_error}")
# ์ตœํ›„์˜ ์ˆ˜๋‹จ
try:
os._exit(0)
except Exception:
pass
return True
def main():
parser = argparse.ArgumentParser(description='Batch TestTime RLVR Evaluation')
parser.add_argument('--model', type=str, default='Qwen/Qwen2.5-7B',
help='Model name to evaluate')
parser.add_argument('--benchmark', type=str, choices=['humaneval', 'mbpp'],
default='mbpp', help='Benchmark to evaluate')
parser.add_argument('--max_problems', type=int, default=10,
help='Maximum number of problems to evaluate (0 = all)')
parser.add_argument('--gpu', type=int, default=6, help='GPU ID to use')
parser.add_argument('--output_dir', type=str,
default='./batch_results',
help='Output directory for results')
parser.add_argument('--resume', action='store_true',
help='Resume from previously completed problems')
parser.add_argument('--start_from', type=str, default=None,
help='Start from specific problem ID (e.g., Mbpp/100)')
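# Example invocation (illustrative values):
#   python batch_evaluate_testtime.py --model Qwen/Qwen2.5-7B --benchmark mbpp \
#       --max_problems 0 --gpu 0 --output_dir ./batch_results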
args = parser.parse_args()
# GPU setup (keep CUDA_VISIBLE_DEVICES if the shell already set it)
if 'CUDA_VISIBLE_DEVICES' not in os.environ:
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
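# Note: if the launching shell already set CUDA_VISIBLE_DEVICES, that value takes precedence and --gpu is only informational.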
print(f"๐ŸŽฏ CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}")
print(f"๐ŸŽฏ Using GPU argument: {args.gpu}")
# Create the output directory
os.makedirs(args.output_dir, exist_ok=True)
try:
success = run_batch_evaluation(args)
exit_code = 0 if success else 1
except Exception as e:
print(f"๐Ÿ’ฅ Batch evaluation failed: {e}")
traceback.print_exc()
exit_code = 1
print(f"\n๐Ÿšช Exiting with code {exit_code}")
# ๊ฐ•์ œ ์ข…๋ฃŒ (VLLM ํ”„๋กœ์„ธ์Šค ์™„์ „ ์ข…๋ฃŒ๋ฅผ ์œ„ํ•ด)
try:
os._exit(exit_code)
except Exception:
sys.exit(exit_code)
if __name__ == '__main__':
main()