|
|
|
""" |
|
Batch TestTime RLVR Evaluation Script |
|
|
|
๋ฒค์น๋งํฌ ์ ์ฒด์ ๋ํ์ฌ TestTime RLVR ํ์ดํ๋ผ์ธ์ ์คํํ๊ณ |
|
์ด๊ธฐ ์๋ฃจ์
์ ํ์ฑ ๋ฐ reasoning tasks ์ฑ๋ฅ์ ํ๊ฐํฉ๋๋ค. |
|
""" |
|
|
|
import os |
|
import sys |
|
import json |
|
import argparse |
|
import time |
|
import re |
|
from pathlib import Path |
|
from datetime import datetime |
|
from typing import Dict, List, Any |
|
import traceback |
|
|
|
|
|
sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2') |
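# NOTE: hard-coded checkout path; adjust if the repository lives elsewhere.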
|
from absolute_zero_reasoner.testtime.complete_pipeline import CompleteTestTimePipeline |
|
from absolute_zero_reasoner.testtime.config import TestTimeConfig, BenchmarkConfig |
|
from absolute_zero_reasoner.testtime.logger import TestTimeLogger |
|
from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator |
|
from absolute_zero_reasoner.testtime.prompts import get_prompt, get_diversity_instruction |
|
|
|
|
|
def generate_detailed_classification(output_dir: str, benchmark: str) -> str: |
|
"""๋ฐฐ์น ํ๊ฐ ๊ฒฐ๊ณผ๋ฅผ 4๊ฐ์ง ์นดํ
๊ณ ๋ฆฌ๋ก ์์ธ ๋ถ๋ฅ""" |
|
|
|
base_dir = os.path.join(output_dir, benchmark) |
|
|
|
if not os.path.exists(base_dir): |
|
return f"## ๐ Detailed Problem Classification\n\nโ ๏ธ Benchmark directory not found: {base_dir}\n\n" |
|
|
|
|
|
complete_success = [] |
|
partial_success = [] |
|
complete_failure = [] |
|
execution_failure = [] |
|
|
|
|
|
for problem_dir in sorted(Path(base_dir).iterdir()): |
|
if not problem_dir.is_dir(): |
|
continue |
|
|
|
problem_id = problem_dir.name |
|
|
|
current_eval_file = problem_dir / "current_evaluation" / "attempt_1.txt" |
|
|
|
if not current_eval_file.exists(): |
|
execution_failure.append(f"{problem_id} (file not found)") |
|
continue |
|
|
|
|
|
try: |
|
with open(current_eval_file, 'r', encoding='utf-8') as f: |
|
content = f.read() |
|
|
|
|
|
result_pattern = r'Result: (.+) \((\d+)/(\d+) tests passed\)' |
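# Matches the result lines written by the evaluation writers below,
# e.g. "Result: CORRECT (5/5 tests passed)".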
|
match = re.search(result_pattern, content) |
|
|
|
if match: |
|
status = match.group(1) |
|
passed = int(match.group(2)) |
|
total = int(match.group(3)) |
|
|
|
if total == 0: |
|
execution_failure.append(f"{problem_id} (0 total tests)") |
|
elif passed == total: |
|
complete_success.append(problem_id) |
|
elif passed == 0: |
|
complete_failure.append(problem_id) |
|
else: |
|
ratio = passed / total * 100 |
|
partial_success.append((problem_id, passed, total, ratio)) |
|
else: |
|
execution_failure.append(f"{problem_id} (no result pattern)") |
|
|
|
except Exception as e: |
|
if "division by zero" in str(e): |
|
execution_failure.append(f"{problem_id} (division by zero)") |
|
else: |
|
execution_failure.append(f"{problem_id} (error: {str(e)[:50]})") |
|
|
|
|
|
partial_success.sort(key=lambda x: x[3]) |
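# Sort partial successes by pass ratio, ascending, so the hardest problems are listed first.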
|
|
|
|
|
result = "## ๐ Detailed Problem Classification\n\n" |
|
|
|
result += f"### ๐ข Complete Success (Baseline = 100%)\n" |
|
result += f"**Count: {len(complete_success)} problems**\n" |
|
result += "**Task IDs:**\n" |
|
|
|
for i in range(0, len(complete_success), 10): |
|
line_tasks = complete_success[i:i+10] |
|
result += "- " + ", ".join(line_tasks) + "\n" |
|
result += "\n" |
|
|
|
result += f"### ๐ก Partial Success (0% < Baseline < 100%)\n" |
|
result += f"**Count: {len(partial_success)} problems**\n" |
|
result += "**Task IDs (ordered by success rate, lowest first):**\n" |
|
for problem_id, passed, total, ratio in partial_success: |
|
result += f"- {problem_id}: {passed}/{total} ({ratio:.1f}%)\n" |
|
result += "\n" |
|
|
|
result += f"### ๐ด Complete Failure (Baseline = 0%)\n" |
|
result += f"**Count: {len(complete_failure)} problems**\n" |
|
result += "**Task IDs:**\n" |
|
|
|
for i in range(0, len(complete_failure), 10): |
|
line_tasks = complete_failure[i:i+10] |
|
result += "- " + ", ".join(line_tasks) + "\n" |
|
result += "\n" |
|
|
|
result += f"### โ Execution Failure (Syntax/Import/Runtime Errors)\n" |
|
result += f"**Count: {len(execution_failure)} problems**\n" |
|
result += "**Task IDs:**\n" |
|
for task in execution_failure: |
|
result += f"- {task}\n" |
|
result += "\n" |
|
|
|
result += f"### ๐ Summary Statistics\n" |
|
total_analyzed = len(complete_success) + len(partial_success) + len(complete_failure) + len(execution_failure) |
|
if total_analyzed > 0: |
|
result += f"- Total Problems with Results: {total_analyzed}\n" |
|
result += f"- Baseline Success Rate: {len(complete_success)/total_analyzed*100:.1f}%\n" |
|
result += f"- Partial Success Rate: {len(partial_success)/total_analyzed*100:.1f}%\n" |
|
result += f"- Complete Failure Rate: {len(complete_failure)/total_analyzed*100:.1f}%\n" |
|
result += f"- Execution Failure Rate: {len(execution_failure)/total_analyzed*100:.1f}%\n" |
|
result += f"\n**Note**: This analysis is based on baseline evaluation (attempt_1.txt) results.\n" |
|
result += f"Problems that failed during early pipeline stages may not appear in these statistics.\n" |
|
result += "\n" |
|
|
|
return result |
|
|
|
|
|
def load_benchmark_problems(benchmark_config: BenchmarkConfig) -> List[str]: |
|
"""๋ฒค์น๋งํฌ์์ ๋ฌธ์ ID ๋ชฉ๋ก ๋ก๋ (EvalPlus ํ์ค ๋ฐฉ์ ์ฌ์ฉ)""" |
|
|
|
problems = [] |
|
|
|
if benchmark_config.name == 'mbpp': |
|
|
|
try: |
|
from evalplus.data.mbpp import get_mbpp_plus |
|
mbpp_problems = get_mbpp_plus() |
|
problems = list(mbpp_problems.keys()) |
|
print(f"โ
MBPP+ ๋ฐ์ดํฐ ๋ก๋ ์ฑ๊ณต: {len(problems)}๊ฐ ๋ฌธ์ (EvalPlus ํ์ค ๋ฐฉ์)") |
|
except Exception as e: |
|
print(f"โ MBPP+ EvalPlus ๋ก๋ฉ ์คํจ, ๊ธฐ์กด ๋ฐฉ์ ์ฌ์ฉ: {e}") |
|
|
|
data_path = benchmark_config.data_path |
|
if os.path.exists(data_path): |
|
with open(data_path, 'r') as f: |
|
for line in f: |
|
try: |
|
data = json.loads(line.strip()) |
|
if 'task_id' in data: |
|
problems.append(data['task_id']) |
|
except json.JSONDecodeError:
|
continue |
|
|
|
elif benchmark_config.name == 'humaneval': |
|
|
|
try: |
|
from evalplus.data.humaneval import get_human_eval_plus |
|
humaneval_problems = get_human_eval_plus() |
|
problems = list(humaneval_problems.keys()) |
|
print(f"โ
HumanEval+ ๋ฐ์ดํฐ ๋ก๋ ์ฑ๊ณต: {len(problems)}๊ฐ ๋ฌธ์ (EvalPlus ํ์ค ๋ฐฉ์)") |
|
except Exception as e: |
|
print(f"โ HumanEval+ EvalPlus ๋ก๋ฉ ์คํจ, ๊ธฐ์กด ๋ฐฉ์ ์ฌ์ฉ: {e}") |
|
|
|
data_path = benchmark_config.data_path |
|
if os.path.exists(data_path): |
|
with open(data_path, 'r') as f: |
|
for line in f: |
|
try: |
|
data = json.loads(line.strip()) |
|
if 'task_id' in data: |
|
problems.append(data['task_id']) |
|
except json.JSONDecodeError:
|
continue |
|
|
|
return problems |
|
|
|
|
|
def get_completed_problems(output_dir: str) -> set: |
|
"""์๋ฃ๋ ๋ฌธ์ ID ๋ชฉ๋ก ๋ก๋ (resume ๊ธฐ๋ฅ์ฉ)""" |
|
completed = set() |
|
|
|
|
|
json_file = os.path.join(output_dir, "batch_evaluation_results.json") |
|
if os.path.exists(json_file): |
|
try: |
|
with open(json_file, 'r', encoding='utf-8') as f: |
|
data = json.load(f) |
|
for result in data.get('problem_results', []): |
|
problem_id = result.get('problem_id') |
|
if problem_id: |
|
completed.add(problem_id) |
|
except Exception as e: |
|
print(f"โ ๏ธ Warning: Could not load existing results: {e}") |
|
|
|
return completed |
|
|
|
|
|
def save_initial_solution_only(result, output_dir, timestamp, problem_id): |
|
"""LLM Generation ์ฑ๊ณต์ initial_solution๋ง ์ ์ฅ""" |
|
|
|
|
|
benchmark = result.get('benchmark', 'unknown') |
|
problem_id_safe = problem_id.replace('/', '_') |
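# Benchmark task IDs contain '/' (e.g. "HumanEval/0"), which is not filesystem-safe.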
|
|
|
|
|
base_dir = os.path.join(output_dir, benchmark, problem_id_safe) |
|
os.makedirs(base_dir, exist_ok=True) |
|
|
|
|
|
initial_solution_dir = os.path.join(base_dir, 'initial_solution') |
|
os.makedirs(initial_solution_dir, exist_ok=True) |
|
|
|
|
|
if 'steps' in result and 'llm_generation' in result['steps']: |
|
llm_step = result['steps']['llm_generation'] |
|
|
|
|
|
if 'problem_loading' in result['steps']: |
|
problem_data = result['steps']['problem_loading'].get('problem', {}) |
|
problem_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_original_problem.txt") |
|
with open(problem_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Problem ID: {problem_id}\n") |
|
f.write(f"Benchmark: {benchmark}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n") |
|
f.write("ORIGINAL BENCHMARK PROBLEM:\n") |
|
f.write("="*80 + "\n") |
|
f.write(problem_data.get('prompt', 'No prompt available')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("FULL LLM PROMPT:\n") |
|
f.write("="*80 + "\n") |
|
|
|
problem_prompt = problem_data.get('prompt', '') |
|
|
|
|
|
if 'HumanEval' in problem_id: |
|
full_prompt = f"""You are a Python writing assistant. Complete the following Python function. |
|
|
|
{problem_prompt} |
|
|
|
Please provide a complete implementation of the function.""" |
|
else: |
|
|
|
full_prompt = f""" |
|
Please generate a complete, self-contained Python script that solves the following problem. |
|
- Wrap the entire script in a Markdown code block with syntax highlighting (```python ... ```). |
|
- For each function, include a concise docstring enclosed in triple single quotes (''' ... '''), placed immediately below the def line. |
|
The docstring should briefly describe: |
|
โข The function's purpose |
|
โข Input parameters |
|
โข Return value |
|
|
|
Problem statement: |
|
{problem_prompt} |
|
""" |
|
f.write(full_prompt.strip()) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("ENTRY POINT:\n") |
|
f.write("="*80 + "\n") |
|
f.write(problem_data.get('entry_point', 'No entry point')) |
|
if 'canonical_solution' in problem_data: |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("CANONICAL SOLUTION:\n") |
|
f.write("="*80 + "\n") |
|
f.write(problem_data.get('canonical_solution', '')) |
|
|
|
|
|
llm_solution_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_llm_solution.txt") |
|
with open(llm_solution_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Problem ID: {problem_id}\n") |
|
f.write(f"Benchmark: {benchmark}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n") |
|
f.write("LLM GENERATED SOLUTION:\n") |
|
f.write("="*80 + "\n") |
|
f.write(llm_step.get('solution', 'No solution generated')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("SYNTAX VALIDATION:\n") |
|
f.write("="*80 + "\n") |
|
syntax_valid = llm_step.get('syntax_valid', False) |
|
f.write(f"Valid: {'โ
YES' if syntax_valid else 'โ NO'}") |
|
if llm_step.get('syntax_error'): |
|
f.write(f"\nError: {llm_step['syntax_error']}") |
|
|
|
|
|
f.write("\n" + "="*80 + "\n") |
|
f.write("SOLUTION CORRECTNESS EVALUATION:\n") |
|
f.write("="*80 + "\n") |
|
|
|
solution_eval = llm_step.get('solution_evaluation') |
|
if solution_eval: |
|
if solution_eval['correct']: |
|
f.write(f"Result: โ
CORRECT ({solution_eval['passed_tests']}/{solution_eval['total_tests']} tests passed)\n") |
|
else: |
|
f.write(f"Result: โ INCORRECT ({solution_eval['passed_tests']}/{solution_eval['total_tests']} tests passed)\n") |
|
|
|
if solution_eval.get('error'): |
|
f.write(f"Error: {solution_eval['error']}\n") |
|
else: |
|
f.write("No evaluation performed (syntax error or evaluation failed)\n") |
|
|
|
|
|
def save_current_evaluation_details(result, base_dir, timestamp): |
|
"""ํ์ฌ ์ฑ๋ฅ ํ๊ฐ ์์ธ ์ ๋ณด ์ ์ฅ - ๊ฐ ์๋๋ณ ๊ฐ๋ณ ํ์ผ ์์ฑ""" |
|
|
|
if 'baseline_evaluation' in result['steps']: |
|
baseline_step = result['steps']['baseline_evaluation'] |
|
|
|
|
|
current_dir = os.path.join(base_dir, 'current_evaluation') |
|
os.makedirs(current_dir, exist_ok=True) |
|
|
|
|
|
problem_data = result['steps'].get('problem_loading', {}).get('problem', {}) |
|
problem_id = result['problem_id'] |
|
benchmark = result.get('benchmark', 'unknown') |
|
|
|
|
|
solutions = baseline_step.get('solutions', []) |
|
for solution_result in solutions: |
|
round_id = solution_result.get('round_id', 0) |
|
attempt_file = os.path.join(current_dir, f'attempt_{round_id + 1}.txt') |
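# round_id is zero-based, so the files are named attempt_1.txt, attempt_2.txt, ...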
|
|
|
with open(attempt_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Current Evaluation - Attempt {round_id + 1}\n") |
|
f.write(f"Problem ID: {problem_id}\n") |
|
f.write(f"Benchmark: {benchmark}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n\n") |
|
|
|
|
|
f.write("1. ORIGINAL PROBLEM:\n") |
|
f.write("="*80 + "\n") |
|
f.write(problem_data.get('prompt', 'No prompt available')) |
|
f.write("\n" + "="*80 + "\n\n") |
|
|
|
|
|
f.write("2. LLM INPUT SCRIPT (PROMPT):\n") |
|
f.write("="*80 + "\n") |
|
problem_prompt = problem_data.get('prompt', '') |
|
|
|
|
|
if 'HumanEval' in problem_id: |
|
full_prompt = get_prompt("solution_humaneval_basic", |
|
problem_prompt=problem_prompt) |
|
else: |
|
full_prompt = get_prompt("solution_mbpp_basic", |
|
problem_prompt=problem_prompt) |
|
f.write(full_prompt.strip()) |
|
f.write("\n" + "="*80 + "\n\n") |
|
|
|
|
|
f.write("3. LLM RESPONSE:\n") |
|
f.write("="*80 + "\n") |
|
f.write(solution_result.get('solution', 'No solution generated')) |
|
f.write("\n" + "="*80 + "\n\n") |
|
|
|
|
|
f.write("4. CORRECTNESS EVALUATION:\n") |
|
f.write("="*80 + "\n") |
|
|
|
|
|
f.write(f"Syntax Valid: {'โ
YES' if solution_result.get('syntax_valid', False) else 'โ NO'}\n") |
|
if solution_result.get('syntax_error'): |
|
f.write(f"Syntax Error: {solution_result['syntax_error']}\n") |
|
|
|
|
|
evaluation = solution_result.get('evaluation') |
|
if evaluation: |
|
if evaluation.get('correct', False): |
|
f.write(f"Result: โ
CORRECT ({evaluation.get('passed_tests', 0)}/{evaluation.get('total_tests', 0)} tests passed)\n") |
|
else: |
|
f.write(f"Result: โ INCORRECT ({evaluation.get('passed_tests', 0)}/{evaluation.get('total_tests', 0)} tests passed)\n") |
|
|
|
if evaluation.get('error'): |
|
f.write(f"Evaluation Error: {evaluation['error']}\n") |
|
else: |
|
f.write("Result: โ NO EVALUATION (syntax error or evaluation failed)\n") |
|
|
|
f.write("="*80 + "\n") |
|
|
|
|
|
summary_file = os.path.join(current_dir, 'summary.txt') |
|
with open(summary_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Current Evaluation Summary\n") |
|
f.write(f"Problem ID: {result['problem_id']}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n\n") |
|
|
|
|
|
f.write("OVERALL STATISTICS:\n") |
|
f.write("="*80 + "\n") |
|
f.write(f"Total Attempts: {baseline_step.get('total_rounds', 0)}\n") |
|
f.write(f"Successful Attempts: {baseline_step.get('success_count', 0)}\n") |
|
f.write(f"Success Rate: {baseline_step.get('average_accuracy', 0.0):.3f}\n") |
|
f.write(f"Evaluation Status: {'โ
SUCCESS' if baseline_step.get('success', False) else 'โ FAILED'}\n") |
|
|
|
if baseline_step.get('error'): |
|
f.write(f"Error: {baseline_step['error']}\n") |
|
|
|
f.write("\n") |
|
f.write("Individual attempt files: attempt_1.txt, attempt_2.txt, attempt_3.txt, attempt_4.txt, attempt_5.txt\n") |
|
|
|
def save_diverse_programs_details(result, base_dir, timestamp): |
|
"""๋ค์ํ ํ๋ก๊ทธ๋จ ์์ฑ ์์ธ ์ ๋ณด ์ ์ฅ""" |
|
|
|
if 'diverse_programs' in result['steps']: |
|
diverse_step = result['steps']['diverse_programs'] |
|
|
|
|
|
diverse_dir = os.path.join(base_dir, 'diverse_programs') |
|
os.makedirs(diverse_dir, exist_ok=True) |
|
|
|
|
|
summary_file = os.path.join(diverse_dir, 'diverse_summary.txt') |
|
with open(summary_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Diverse Programs Generation\n") |
|
f.write(f"Problem ID: {result['problem_id']}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n\n") |
|
|
|
|
|
f.write("DIVERSE PROGRAMS STATISTICS:\n") |
|
f.write("="*80 + "\n") |
|
f.write(f"Total Programs: {diverse_step.get('total_programs', 0)}\n") |
|
f.write(f"Valid Programs: {diverse_step.get('valid_programs', 0)}\n") |
|
f.write(f"Total IPO Triples: {diverse_step.get('total_ipo_triples', 0)}\n") |
|
f.write(f"Generation Status: {'โ
SUCCESS' if diverse_step.get('success', False) else 'โ FAILED'}\n") |
|
|
|
if diverse_step.get('error'): |
|
f.write(f"Error: {diverse_step['error']}\n") |
|
|
|
f.write("\n\n") |
|
|
|
|
|
f.write("PROGRAM-BY-PROGRAM RESULTS:\n") |
|
f.write("="*80 + "\n") |
|
|
|
programs = diverse_step.get('programs', []) |
|
for program_result in programs: |
|
variation_id = program_result.get('variation_id', 0) |
|
f.write(f"\nProgram {variation_id + 1}:\n") |
|
f.write(f" Syntax Valid: {'โ
' if program_result.get('syntax_valid', False) else 'โ'}\n") |
|
|
|
if program_result.get('syntax_error'): |
|
f.write(f" Syntax Error: {program_result['syntax_error']}\n") |
|
|
|
f.write(f" IPO Triples: {program_result.get('num_ipo_triples', 0)}\n") |
|
f.write(f" Generated Inputs: {program_result.get('num_generated_inputs', 0)}\n") |
|
|
|
|
|
programs = diverse_step.get('programs', []) |
|
for program_result in programs: |
|
variation_id = program_result.get('variation_id', 0) |
|
|
|
|
|
program_dir = os.path.join(diverse_dir, f'program_{variation_id + 1}') |
|
os.makedirs(program_dir, exist_ok=True) |
|
|
|
|
|
detail_file = os.path.join(program_dir, 'generation_details.txt') |
|
with open(detail_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Diverse Program {variation_id + 1} - Generation Details\n") |
|
f.write(f"Problem ID: {result['problem_id']}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n\n") |
|
|
|
|
|
problem_data = result['steps'].get('problem_loading', {}).get('problem', {}) |
|
f.write("1. ORIGINAL PROBLEM:\n") |
|
f.write("="*80 + "\n") |
|
f.write(problem_data.get('prompt', 'No prompt available')) |
|
f.write("\n" + "="*80 + "\n\n") |
|
|
|
|
|
f.write("2. DIVERSITY PROMPT USED:\n") |
|
f.write("="*80 + "\n") |
|
|
|
|
|
diversity_instruction = get_diversity_instruction(variation_id) |
|
problem_prompt = problem_data.get('prompt', '') |
|
problem_id = result['problem_id'] |
|
|
|
|
|
if 'HumanEval' in problem_id: |
|
full_prompt = get_prompt("diverse_humaneval_basic", |
|
diversity_instruction=diversity_instruction, |
|
problem_prompt=problem_prompt) |
|
else: |
|
full_prompt = get_prompt("diverse_mbpp_basic", |
|
diversity_instruction=diversity_instruction, |
|
problem_prompt=problem_prompt) |
|
f.write(full_prompt.strip()) |
|
f.write("\n" + "="*80 + "\n\n") |
|
|
|
|
|
f.write("3. LLM RESPONSE:\n") |
|
f.write("="*80 + "\n") |
|
f.write(program_result.get('solution', 'No solution generated')) |
|
f.write("\n" + "="*80 + "\n\n") |
|
|
|
|
|
f.write("4. EVALUATION RESULTS:\n") |
|
f.write("="*80 + "\n") |
|
f.write(f"Syntax Valid: {'โ
YES' if program_result.get('syntax_valid', False) else 'โ NO'}\n") |
|
if program_result.get('syntax_error'): |
|
f.write(f"Syntax Error: {program_result['syntax_error']}\n") |
|
f.write(f"IPO Triples Generated: {program_result.get('num_ipo_triples', 0)}\n") |
|
f.write(f"Input Generation: {program_result.get('num_generated_inputs', 0)} new inputs\n") |
|
f.write("="*80 + "\n") |
|
|
|
|
|
solution_file = os.path.join(program_dir, 'solution.py') |
|
with open(solution_file, 'w', encoding='utf-8') as f: |
|
f.write(f"# Diverse Program {variation_id + 1}\n") |
|
f.write(f"# Problem ID: {result['problem_id']}\n") |
|
f.write(f"# Generated: {timestamp}\n") |
|
f.write(f"# Syntax Valid: {program_result.get('syntax_valid', False)}\n") |
|
f.write(f"# IPO Triples: {program_result.get('num_ipo_triples', 0)}\n") |
|
f.write("\n") |
|
f.write(program_result.get('solution', '# No solution available')) |
|
|
|
|
|
ipo_triples = program_result.get('ipo_triples', []) |
|
if ipo_triples: |
|
ipo_dir = os.path.join(program_dir, 'ipo_triples') |
|
os.makedirs(ipo_dir, exist_ok=True) |
|
|
|
for i, triple in enumerate(ipo_triples): |
|
triple_file = os.path.join(ipo_dir, f'triple_{i + 1}.json') |
|
with open(triple_file, 'w', encoding='utf-8') as f: |
|
json.dump(triple, f, indent=2, ensure_ascii=False) |
|
|
|
|
|
input_gen_info = program_result.get('input_generation_info') |
|
if input_gen_info is not None: |
|
input_gen_file = os.path.join(program_dir, 'input_generation_details.txt') |
|
with open(input_gen_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Input Generation Details - Program {variation_id + 1}\n") |
|
f.write(f"Problem ID: {result['problem_id']}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n\n") |
|
|
|
f.write("1. FUNCTION INFO:\n") |
|
f.write("="*80 + "\n") |
|
func_info = input_gen_info.get('function_info', {}) |
|
f.write(f"Function Name: {func_info.get('name', 'N/A')}\n") |
|
f.write(f"Parameters: {func_info.get('params', 'N/A')}\n") |
|
f.write(f"Parameters String: {func_info.get('params_str', 'N/A')}\n\n") |
|
|
|
f.write("2. ARGUMENT TYPE INFO:\n") |
|
f.write("="*80 + "\n") |
|
f.write(input_gen_info.get('arg_type_info', 'N/A') + "\n\n") |
|
|
|
f.write("3. EXISTING EXAMPLES:\n") |
|
f.write("="*80 + "\n") |
|
for i, (inp, out) in enumerate(input_gen_info.get('existing_examples', [])): |
|
f.write(f"Example {i+1}: Input: {inp} โ Output: {out}\n") |
|
f.write("\n") |
|
|
|
f.write("4. LLM PROMPT:\n") |
|
f.write("="*80 + "\n") |
|
f.write(input_gen_info.get('prompt', 'N/A') + "\n") |
|
f.write("="*80 + "\n\n") |
|
|
|
f.write("5. LLM RESPONSE:\n") |
|
f.write("="*80 + "\n") |
|
f.write(input_gen_info.get('llm_response', 'N/A') + "\n") |
|
f.write("="*80 + "\n\n") |
|
|
|
f.write("6. EXTRACTED INPUTS:\n") |
|
f.write("="*80 + "\n") |
|
extracted = input_gen_info.get('extracted_inputs', []) |
|
if extracted: |
|
for i, inp_data in enumerate(extracted): |
|
f.write(f"Input {i+1}: {inp_data}\n") |
|
else: |
|
f.write("No inputs extracted\n") |
|
|
|
|
|
if 'error' in input_gen_info: |
|
f.write("\n7. ERROR:\n") |
|
f.write("="*80 + "\n") |
|
f.write(input_gen_info['error'] + "\n") |
|
|
|
def save_input_generation_details(result, base_dir, timestamp): |
|
"""์
๋ ฅ ์์ฑ ๊ด๋ จ ์์ธ ์ ๋ณด ์ ์ฅ""" |
|
|
|
if 'ipo_extraction' in result['steps']: |
|
ipo_step = result['steps']['ipo_extraction'] |
|
num_generated = ipo_step.get('num_generated', 0) |
|
generated_inputs = ipo_step.get('generated_inputs', []) |
|
|
|
generation_prompt = ipo_step.get('generation_prompt', '') |
|
input_generation_attempted = bool(generation_prompt) or len(generated_inputs) > 0 |
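# Treat generation as "attempted" if a prompt was recorded or any inputs were produced.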
|
|
|
|
|
if 'ipo_extraction' in result['steps']: |
|
|
|
input_gen_dir = os.path.join(base_dir, 'input_generation') |
|
os.makedirs(input_gen_dir, exist_ok=True) |
|
|
|
|
|
details_file = os.path.join(input_gen_dir, 'generation_details.txt') |
|
with open(details_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Input Generation Details\n") |
|
f.write(f"Problem ID: {result['problem_id']}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n\n") |
|
|
|
|
|
f.write("GENERATION STATISTICS:\n") |
|
f.write("="*80 + "\n") |
|
f.write(f"Original IPO triples: {ipo_step.get('num_original', 0)}\n") |
|
f.write(f"Generated inputs: {ipo_step.get('num_generated', 0)}\n") |
|
f.write(f"Total IPO triples: {ipo_step.get('num_triples', 0)}\n") |
|
f.write(f"Input generation attempted: {input_generation_attempted}\n") |
|
|
|
|
|
if not input_generation_attempted: |
|
f.write(f"FAILURE REASON: Input generation was not attempted\n") |
|
elif num_generated == 0: |
|
f.write(f"FAILURE REASON: LLM response could not be parsed or contained no valid inputs\n") |
|
|
|
|
|
f.write("\n\n" + "="*80 + "\n") |
|
f.write("LLM INPUT GENERATION PROMPT:\n") |
|
f.write("="*80 + "\n") |
|
f.write(ipo_step.get('generation_prompt', 'No prompt available')) |
|
|
|
|
|
f.write("\n\n" + "="*80 + "\n") |
|
f.write("LLM RESPONSE:\n") |
|
f.write("="*80 + "\n") |
|
f.write(ipo_step.get('generation_response', 'No response available')) |
|
|
|
|
|
f.write("\n\n" + "="*80 + "\n") |
|
f.write("EXTRACTED AND VALIDATED INPUTS:\n") |
|
f.write("="*80 + "\n") |
|
generated_inputs = ipo_step.get('generated_inputs', []) |
|
if generated_inputs: |
|
for i, inp in enumerate(generated_inputs): |
|
f.write(f"\nInput {i+1}:\n") |
|
f.write(f"{inp}\n") |
|
else: |
|
f.write("No valid inputs were extracted.\n") |
|
|
|
|
|
def save_detailed_results(result, output_dir, timestamp): |
|
"""์์ธํ ๊ฒฐ๊ณผ๋ฅผ ๊ฐ๋ณ ํ์ผ๋ก ์ ์ฅ (test_complete_pipeline.py ์คํ์ผ)""" |
|
|
|
|
|
benchmark = result.get('benchmark', 'unknown') |
|
problem_id = result['problem_id'] |
|
problem_id_safe = problem_id.replace('/', '_') |
|
|
|
|
|
base_dir = os.path.join(output_dir, benchmark, problem_id_safe) |
|
os.makedirs(base_dir, exist_ok=True) |
|
|
|
|
|
if 'llm_generation' in result['steps']: |
|
llm_step = result['steps']['llm_generation'] |
|
|
|
initial_solution_dir = os.path.join(base_dir, 'initial_solution') |
|
os.makedirs(initial_solution_dir, exist_ok=True) |
|
|
|
|
|
if 'problem_loading' in result['steps']: |
|
problem_data = result['steps']['problem_loading'].get('problem', {}) |
|
problem_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_original_problem.txt") |
|
with open(problem_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Problem ID: {result['problem_id']}\n") |
|
f.write(f"Benchmark: {result['benchmark']}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n") |
|
f.write("ORIGINAL BENCHMARK PROBLEM:\n") |
|
f.write("="*80 + "\n") |
|
f.write(problem_data.get('prompt', 'No prompt available')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("FULL LLM PROMPT:\n") |
|
f.write("="*80 + "\n") |
|
|
|
problem_prompt = problem_data.get('prompt', '') |
|
|
|
|
|
if 'HumanEval' in problem_id: |
|
full_prompt = f"""You are a Python writing assistant. Complete the following Python function. |
|
|
|
{problem_prompt} |
|
|
|
Please provide a complete implementation of the function.""" |
|
else: |
|
|
|
full_prompt = f""" |
|
Please generate a complete, self-contained Python script that solves the following problem. |
|
- Wrap the entire script in a Markdown code block with syntax highlighting (```python ... ```). |
|
- For each function, include a concise docstring enclosed in triple single quotes (''' ... '''), placed immediately below the def line. |
|
The docstring should briefly describe: |
|
โข The function's purpose |
|
โข Input parameters |
|
โข Return value |
|
|
|
Problem statement: |
|
{problem_prompt} |
|
""" |
|
f.write(full_prompt.strip()) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("ENTRY POINT:\n") |
|
f.write("="*80 + "\n") |
|
f.write(problem_data.get('entry_point', 'No entry point')) |
|
if 'canonical_solution' in problem_data: |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("CANONICAL SOLUTION:\n") |
|
f.write("="*80 + "\n") |
|
f.write(problem_data.get('canonical_solution', '')) |
|
if 'test' in problem_data: |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("TEST CASES:\n") |
|
f.write("="*80 + "\n") |
|
f.write(str(problem_data.get('test', ''))) |
|
|
|
|
|
llm_solution_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_llm_solution.txt") |
|
with open(llm_solution_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Problem ID: {result['problem_id']}\n") |
|
f.write(f"Benchmark: {result['benchmark']}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n") |
|
f.write("LLM GENERATED SOLUTION:\n") |
|
f.write("="*80 + "\n") |
|
f.write(llm_step.get('solution', 'No solution generated')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("SYNTAX VALIDATION:\n") |
|
f.write("="*80 + "\n") |
|
syntax_valid = llm_step.get('syntax_valid', False) |
|
f.write(f"Valid: {'โ
YES' if syntax_valid else 'โ NO'}") |
|
if llm_step.get('syntax_error'): |
|
f.write(f"\nError: {llm_step['syntax_error']}") |
|
|
|
|
|
f.write("\n" + "="*80 + "\n") |
|
f.write("SOLUTION CORRECTNESS EVALUATION:\n") |
|
f.write("="*80 + "\n") |
|
|
|
solution_eval = llm_step.get('solution_evaluation') |
|
if solution_eval: |
|
if solution_eval['correct']: |
|
f.write(f"Result: โ
CORRECT ({solution_eval['passed_tests']}/{solution_eval['total_tests']} tests passed)\n") |
|
else: |
|
f.write(f"Result: โ INCORRECT ({solution_eval['passed_tests']}/{solution_eval['total_tests']} tests passed)\n") |
|
|
|
if solution_eval.get('error'): |
|
f.write(f"Error: {solution_eval['error']}\n") |
|
else: |
|
f.write("No evaluation performed (syntax error or no test cases)\n") |
|
|
|
|
|
if 'ipo_extraction' in result['steps']: |
|
ipo_step = result['steps']['ipo_extraction'] |
|
triples = ipo_step.get('triples', []) |
|
|
|
if triples: |
|
ipo_dir = os.path.join(base_dir, 'ipo_triples') |
|
os.makedirs(ipo_dir, exist_ok=True) |
|
|
|
for i, triple in enumerate(triples): |
|
triple_file = os.path.join(ipo_dir, f"{problem_id_safe}_triple_{i+1}.json") |
|
with open(triple_file, 'w', encoding='utf-8') as f: |
|
json.dump(triple, f, indent=2, ensure_ascii=False) |
|
|
|
|
|
if 'task_generation' in result['steps']: |
|
task_step = result['steps']['task_generation'] |
|
all_tasks = task_step.get('all_tasks', {}) |
|
|
|
if all_tasks: |
|
task_dir = os.path.join(base_dir, 'task_prompts') |
|
os.makedirs(task_dir, exist_ok=True) |
|
|
|
for task_type, tasks in all_tasks.items(): |
|
for i, task in enumerate(tasks): |
|
task_file = os.path.join(task_dir, f"{problem_id_safe}_{task_type}_{i+1}.txt") |
|
with open(task_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Task Type: {task_type}\n") |
|
f.write(f"Task ID: {task.get('task_id', 'N/A')}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n") |
|
f.write("TASK PROMPT:\n") |
|
f.write("="*80 + "\n") |
|
f.write(task.get('prompt', 'No prompt available')) |
|
|
|
|
|
if 'task_evaluation' in result['steps']: |
|
eval_step = result['steps']['task_evaluation'] |
|
evaluations = eval_step.get('evaluations', {}) |
|
|
|
response_dir = os.path.join(base_dir, 'llm_responses') |
|
os.makedirs(response_dir, exist_ok=True) |
|
|
|
response_count = 0 |
|
for task_type, task_evals in evaluations.items(): |
|
for i, evaluation in enumerate(task_evals): |
|
response_file = os.path.join(response_dir, f"{problem_id_safe}_{task_type}_{i+1}_response.txt") |
|
with open(response_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Task Type: {task_type}\n") |
|
f.write(f"Task ID: {evaluation.get('task_id', 'N/A')}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n") |
|
f.write("ORIGINAL PROMPT:\n") |
|
f.write("="*80 + "\n") |
|
f.write(evaluation.get('prompt', 'No prompt available')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("LLM RESPONSE:\n") |
|
f.write("="*80 + "\n") |
|
f.write(evaluation.get('llm_response', 'No response')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("EXPECTED SOLUTION:\n") |
|
f.write("="*80 + "\n") |
|
f.write(evaluation.get('expected_solution', 'No expected solution')) |
|
|
|
|
|
if 'reward_computation' in result['steps']: |
|
reward_step = result['steps']['reward_computation'] |
|
rewards = reward_step.get('rewards', {}) |
|
rewards_by_type = rewards.get('rewards_by_type', {}) |
|
|
|
|
|
current_task_rewards = rewards_by_type.get(task_type, []) |
|
current_reward = None |
|
for reward in current_task_rewards: |
|
if reward.get('task_id') == evaluation.get('task_id'): |
|
current_reward = reward |
|
break |
|
|
|
if current_reward and 'extracted_answer' in current_reward: |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("EXTRACTED ANSWER:\n") |
|
f.write("="*80 + "\n") |
|
f.write(current_reward['extracted_answer']) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("MATCH RESULT:\n") |
|
f.write("="*80 + "\n") |
|
match_result = "CORRECT" if current_reward.get('basic_accuracy', 0) > 0 else "INCORRECT"
|
f.write(f"{match_result} (Score: {current_reward.get('basic_accuracy', 0):.3f})") |
|
|
|
response_count += 1 |
|
|
|
print(f"๐ LLM ์๋ต ์ ์ฅ: {response_dir}/ ({response_count}๊ฐ ํ์ผ)") |
|
|
|
|
|
save_input_generation_details(result, base_dir, timestamp) |
|
|
|
|
|
summary_file = os.path.join(base_dir, f"{problem_id_safe}_summary.json") |
|
with open(summary_file, 'w', encoding='utf-8') as f: |
|
summary = { |
|
'problem_id': result['problem_id'], |
|
'benchmark': result['benchmark'], |
|
'success': result['success'], |
|
'timestamp': timestamp, |
|
'initial_solution_correct': False, |
|
'ipo_extraction_success': False, |
|
'reasoning_task_results': {} |
|
} |
|
|
|
|
|
if 'llm_generation' in result['steps']: |
|
llm_step = result['steps']['llm_generation'] |
|
eval_result = llm_step.get('solution_evaluation') |
|
if eval_result: |
|
summary['initial_solution_correct'] = eval_result['correct'] |
|
|
|
|
|
if 'ipo_extraction' in result['steps']: |
|
ipo_step = result['steps']['ipo_extraction'] |
|
summary['ipo_extraction_success'] = ipo_step.get('success', False) |
|
|
|
|
|
if 'reward_computation' in result['steps']: |
|
reward_step = result['steps']['reward_computation'] |
|
rewards = reward_step.get('rewards', {}) |
|
for task_type, type_rewards in rewards.get('rewards_by_type', {}).items(): |
|
correct_count = sum(1 for r in type_rewards if r['basic_accuracy'] > 0) |
|
total_count = len(type_rewards) |
|
summary['reasoning_task_results'][task_type] = { |
|
'correct': correct_count, |
|
'total': total_count, |
|
'accuracy': correct_count / total_count if total_count > 0 else 0 |
|
} |
|
|
|
json.dump(summary, f, indent=2, ensure_ascii=False) |
|
|
|
|
|
def run_batch_evaluation(args): |
|
"""๋ฒค์น๋งํฌ ์ ์ฒด์ ๋ํ ๋ฐฐ์น ํ๊ฐ ์คํ""" |
|
|
|
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
|
|
|
output_dir = os.path.join(args.output_dir, f"batch_evaluation_{timestamp}") |
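# Each run writes into its own timestamped directory, e.g. batch_evaluation_20250101_120000/.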
|
os.makedirs(output_dir, exist_ok=True) |
|
|
|
|
|
logger = TestTimeLogger(log_level='INFO') |
|
logger.log_info(f"๐ Starting batch TestTime RLVR evaluation") |
|
logger.log_info(f"๐ Model: {args.model}") |
|
logger.log_info(f"๐ฏ Benchmark: {args.benchmark}") |
|
logger.log_info(f"๐ Max problems: {args.max_problems}") |
|
logger.log_info(f"๐ Output: {output_dir}") |
|
|
|
|
|
config = TestTimeConfig( |
|
model_name=args.model, |
|
max_adaptation_steps=3, |
|
learning_rate=1e-5, |
|
task_distribution={'induction': 0.4, 'deduction': 0.3, 'abduction': 0.3}, |
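# Adaptation task mix: 40% induction, 30% deduction, 30% abduction.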
|
adaptation_batch_size=1, |
|
max_tasks_per_type=3, |
|
use_flash_attention=False, |
|
torch_dtype='float16', |
|
enable_gradient_checkpointing=False |
|
) |
|
|
|
|
|
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) |
|
|
|
if args.benchmark == 'humaneval': |
|
benchmark_config = BenchmarkConfig.get_humaneval_config() |
|
benchmark_config.data_path = os.path.join(base_dir, 'evaluation/code_eval/data/HumanEvalPlus.jsonl') |
|
elif args.benchmark == 'mbpp': |
|
benchmark_config = BenchmarkConfig.get_mbpp_config() |
|
benchmark_config.data_path = os.path.join(base_dir, 'evaluation/code_eval/data/MbppPlus.jsonl') |
|
else: |
|
raise ValueError(f"Unsupported benchmark: {args.benchmark}") |
|
|
|
|
|
logger.log_info("๐ฆ Loading model and tokenizer...") |
|
try: |
|
model, tokenizer = InitialSolutionGenerator.load_model_with_optimizations( |
|
args.model, f'cuda:{args.gpu}', config, use_vllm=True |
|
) |
|
logger.log_info("โ
Model loaded successfully") |
|
except Exception as e: |
|
logger.log_error(f"โ Failed to load model: {e}") |
|
return False |
|
|
|
|
|
pipeline = CompleteTestTimePipeline(model, tokenizer, config, logger) |
|
|
|
|
|
logger.log_info("๐ Loading benchmark problems...") |
|
problems = load_benchmark_problems(benchmark_config) |
|
|
|
if not problems: |
|
logger.log_error("โ No problems found in benchmark") |
|
return False |
|
|
|
|
|
original_problem_count = len(problems) |
|
completed_problems = set() |
|
existing_results = None |
|
|
|
if args.resume or args.start_from: |
|
|
|
completed_problems = get_completed_problems(output_dir) |
|
|
|
if completed_problems: |
|
logger.log_info(f"๐ Resume mode: Found {len(completed_problems)} completed problems") |
|
|
|
|
|
existing_results_file = os.path.join(output_dir, "batch_evaluation_results.json") |
|
if os.path.exists(existing_results_file): |
|
with open(existing_results_file, 'r', encoding='utf-8') as f: |
|
existing_results = json.load(f) |
|
logger.log_info(f"๐ Loaded existing results from {existing_results_file}") |
|
|
|
|
|
problems = [p for p in problems if p not in completed_problems] |
|
logger.log_info(f"๐ After excluding completed: {len(problems)} problems remaining") |
|
|
|
|
|
if args.start_from: |
|
try: |
|
start_idx = problems.index(args.start_from) |
|
problems = problems[start_idx:] |
|
logger.log_info(f"๐ Starting from problem: {args.start_from} (index {start_idx})") |
|
except ValueError: |
|
logger.log_warning(f"โ ๏ธ Problem {args.start_from} not found, starting from beginning") |
|
|
|
|
|
if args.max_problems > 0: |
|
problems = problems[:args.max_problems] |
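# A non-positive args.max_problems skips this cap, so all remaining problems are processed.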
|
|
|
if not problems: |
|
logger.log_info("๐ All problems already completed!") |
|
return True |
|
|
|
logger.log_info(f"๐ Processing {len(problems)} problems (Total in benchmark: {original_problem_count})") |
|
|
|
|
|
if existing_results: |
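# Resume path: merge previously saved statistics so counters keep accumulating,
# filling in defaults for stat blocks that older result files may not contain.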
|
|
|
results = { |
|
'config': existing_results['config'].copy(), |
|
'initial_solution_stats': { |
|
**existing_results['initial_solution_stats'].copy(), |
|
'first_attempt_correct': existing_results['initial_solution_stats'].get('first_attempt_correct', 0), |
|
'at_least_once_correct': existing_results['initial_solution_stats'].get('at_least_once_correct', 0), |
|
'total_attempts': existing_results['initial_solution_stats'].get('total_attempts', 0), |
|
'total_successes': existing_results['initial_solution_stats'].get('total_successes', 0), |
|
'first_attempt_failed_problem_ids': existing_results['initial_solution_stats'].get('first_attempt_failed_problem_ids', []), |
|
'never_success_problem_ids': existing_results['initial_solution_stats'].get('never_success_problem_ids', []) |
|
}, |
|
'reasoning_task_stats': { |
|
task_type: { |
|
**stats, |
|
'total_accuracy': stats.get('total_accuracy', 0.0) |
|
} |
|
for task_type, stats in existing_results['reasoning_task_stats'].items() |
|
}, |
|
'ipo_extraction_stats': existing_results['ipo_extraction_stats'].copy(), |
|
'input_generation_stats': existing_results.get('input_generation_stats', { |
|
'total_attempts': 0, |
|
'successful': 0, |
|
'failed': 0, |
|
'total_generated_inputs': 0, |
|
'average_inputs_per_problem': 0.0, |
|
'problems_with_generation': [] |
|
}).copy(), |
|
'current_evaluation_stats': existing_results.get('current_evaluation_stats', existing_results.get('baseline_evaluation_stats', { |
|
'total_attempts': 0, |
|
'successful': 0, |
|
'failed': 0, |
|
'total_rounds': 0, |
|
'total_success_rounds': 0, |
|
'average_success_rate': 0.0, |
|
'failed_problem_ids': [] |
|
})).copy(), |
|
'diverse_programs_stats': existing_results.get('diverse_programs_stats', { |
|
'total_attempts': 0, |
|
'successful': 0, |
|
'failed': 0, |
|
'total_programs_generated': 0, |
|
'total_valid_programs': 0, |
|
'total_ipo_triples': 0, |
|
'average_programs_per_problem': 0.0, |
|
'average_ipo_per_problem': 0.0, |
|
'failed_problem_ids': [] |
|
}).copy(), |
|
'timing_stats': existing_results['timing_stats'].copy(), |
|
'problem_results': existing_results['problem_results'].copy() |
|
} |
|
results['config']['resumed'] = True |
|
results['config']['resumed_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
|
results['config']['remaining_problems'] = len(problems) |
|
else: |
|
|
|
results = { |
|
'config': { |
|
'model': args.model, |
|
'benchmark': args.benchmark, |
|
'timestamp': timestamp, |
|
'total_problems': original_problem_count, |
|
'processing_problems': len(problems) |
|
}, |
|
'initial_solution_stats': { |
|
'total': 0, |
|
'first_attempt_correct': 0, |
|
'at_least_once_correct': 0, |
|
'total_attempts': 0, |
|
'total_successes': 0, |
|
'first_attempt_failed_problem_ids': [], |
|
'never_success_problem_ids': [], |
|
'syntax_errors': 0, |
|
'evaluation_errors': 0, |
|
'correct': 0, |
|
'failed_problem_ids': [] |
|
}, |
|
'reasoning_task_stats': { |
|
'induction': { |
|
'total': 0, |
|
'correct': 0, |
|
'accuracy_0_count': 0, |
|
'accuracy_1_count': 0, |
|
'total_accuracy': 0.0 |
|
}, |
|
'deduction': { |
|
'total': 0, |
|
'correct': 0, |
|
'accuracy_0_count': 0, |
|
'accuracy_1_count': 0, |
|
'total_accuracy': 0.0 |
|
}, |
|
'abduction': { |
|
'total': 0, |
|
'correct': 0, |
|
'accuracy_0_count': 0, |
|
'accuracy_1_count': 0, |
|
'total_accuracy': 0.0 |
|
} |
|
}, |
|
'timing_stats': { |
|
'total_time_seconds': 0, |
|
'average_time_per_problem': 0, |
|
'problem_times': [] |
|
}, |
|
'ipo_extraction_stats': { |
|
'total_attempts': 0, |
|
'successful': 0, |
|
'failed': 0, |
|
'failed_problem_ids': [] |
|
}, |
|
'input_generation_stats': { |
|
'total_attempts': 0, |
|
'successful': 0, |
|
'failed': 0, |
|
'total_generated_inputs': 0, |
|
'average_inputs_per_problem': 0.0, |
|
'problems_with_generation': [] |
|
}, |
|
'current_evaluation_stats': { |
|
'total_attempts': 0, |
|
'successful': 0, |
|
'failed': 0, |
|
'total_rounds': 0, |
|
'total_success_rounds': 0, |
|
'average_success_rate': 0.0, |
|
'failed_problem_ids': [] |
|
}, |
|
'diverse_programs_stats': { |
|
'total_attempts': 0, |
|
'successful': 0, |
|
'failed': 0, |
|
'total_programs_generated': 0, |
|
'total_valid_programs': 0, |
|
'total_ipo_triples': 0, |
|
'average_programs_per_problem': 0.0, |
|
'average_ipo_per_problem': 0.0, |
|
'failed_problem_ids': [] |
|
}, |
|
'problem_results': [] |
|
} |
|
|
|
|
|
start_total_time = time.time() |
|
|
|
for i, problem_id in enumerate(problems): |
|
logger.log_info(f"๐ [{i+1}/{len(problems)}] Processing {problem_id}") |
|
|
|
|
|
problem_start_time = time.time() |
|
|
|
|
|
step_results = { |
|
'problem_loading': False, |
|
'llm_generation': False, |
|
'solution_evaluation': False, |
|
'ipo_extraction': False, |
|
'input_generation': False, |
|
'task_generation': False, |
|
'task_evaluation': False |
|
} |
|
|
|
try: |
|
|
|
result = pipeline.run_complete_pipeline(benchmark_config, problem_id) |
|
|
|
|
|
problem_end_time = time.time() |
|
problem_duration = problem_end_time - problem_start_time |
|
|
|
|
|
if 'steps' in result: |
|
step_results['problem_loading'] = result.get('success', False) |
|
|
|
|
|
if 'baseline_evaluation' in result['steps']: |
|
baseline_eval = result['steps']['baseline_evaluation'] |
|
step_results['llm_generation'] = baseline_eval.get('success', False) |
|
step_results['solution_evaluation'] = baseline_eval.get('success_count', 0) > 0 |
|
|
|
|
|
if 'diverse_programs' in result['steps']: |
|
diverse_progs = result['steps']['diverse_programs'] |
|
step_results['ipo_extraction'] = diverse_progs.get('total_ipo_triples', 0) > 0 |
|
|
|
if 'diverse_programs' in result['steps']: |
|
diverse_progs = result['steps']['diverse_programs'] |
|
total_generated = sum(p.get('num_generated_inputs', 0) for p in diverse_progs.get('programs', [])) |
|
step_results['input_generation'] = total_generated > 0 |
|
|
|
|
|
if 'task_generation' in result['steps']: |
|
task_gen = result['steps']['task_generation'] |
|
step_results['task_generation'] = task_gen.get('total_tasks', 0) > 0 |
|
|
|
if 'task_evaluation' in result['steps']: |
|
task_eval = result['steps']['task_evaluation'] |
|
step_results['task_evaluation'] = task_eval.get('total_evaluated', 0) > 0 |
|
|
|
|
|
logger.log_info(f" ๐ Problem Loading: {'โ
' if step_results['problem_loading'] else 'โ'}") |
|
logger.log_info(f" ๐ค LLM Generation: {'โ
' if step_results['llm_generation'] else 'โ'}") |
|
logger.log_info(f" ๐ Solution Evaluation: {'โ
' if step_results['solution_evaluation'] else 'โ'}") |
|
logger.log_info(f" ๐ IPO Extraction: {'โ
' if step_results['ipo_extraction'] else 'โ'}") |
|
logger.log_info(f" ๐ฒ Input Generation: {'โ
' if step_results['input_generation'] else 'โ'}") |
|
logger.log_info(f" ๐ Task Generation: {'โ
' if step_results['task_generation'] else 'โ'}") |
|
logger.log_info(f" ๐ง Task Evaluation: {'โ
' if step_results['task_evaluation'] else 'โ'}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if result['success']: |
|
try: |
|
save_detailed_results(result, output_dir, timestamp) |
|
|
|
|
|
base_dir = os.path.join(output_dir, result.get('benchmark', 'unknown'), problem_id.replace('/', '_')) |
|
save_current_evaluation_details(result, base_dir, timestamp) |
|
save_diverse_programs_details(result, base_dir, timestamp) |
|
|
|
logger.log_info(f" ๐ Complete results saved for {problem_id}") |
|
except Exception as e: |
|
logger.log_warning(f" โ ๏ธ Failed to save complete results: {e}") |
|
|
|
|
|
results['initial_solution_stats']['total'] += 1 |
|
initial_solution_correct = False |
|
|
|
|
|
results['ipo_extraction_stats']['total_attempts'] += 1 |
|
|
|
if result['success']: |
|
|
|
baseline_eval = result['steps'].get('baseline_evaluation', {}) |
|
attempts = baseline_eval.get('solutions', []) |
|
|
|
if attempts: |
|
|
|
results['initial_solution_stats']['total_attempts'] += len(attempts) |
|
successes = sum(1 for attempt in attempts if attempt.get('evaluation', {}).get('correct', False)) |
|
results['initial_solution_stats']['total_successes'] += successes |
|
|
|
|
|
first_attempt_correct = attempts[0].get('evaluation', {}).get('correct', False) |
|
if first_attempt_correct: |
|
results['initial_solution_stats']['first_attempt_correct'] += 1 |
|
else: |
|
|
|
if problem_id not in results['initial_solution_stats']['first_attempt_failed_problem_ids']: |
|
results['initial_solution_stats']['first_attempt_failed_problem_ids'].append(problem_id) |
|
|
|
|
|
at_least_once_success = any(attempt.get('evaluation', {}).get('correct', False) for attempt in attempts) |
|
if at_least_once_success: |
|
results['initial_solution_stats']['at_least_once_correct'] += 1 |
|
results['initial_solution_stats']['correct'] += 1 |
|
initial_solution_correct = True |
|
else: |
|
|
|
if problem_id not in results['initial_solution_stats']['never_success_problem_ids']: |
|
results['initial_solution_stats']['never_success_problem_ids'].append(problem_id) |
|
if problem_id not in results['initial_solution_stats']['failed_problem_ids']: |
|
results['initial_solution_stats']['failed_problem_ids'].append(problem_id) |
|
|
|
|
|
first_attempt = attempts[0] |
|
if not first_attempt.get('syntax_valid', True): |
|
results['initial_solution_stats']['syntax_errors'] += 1 |
|
if first_attempt.get('evaluation_error'): |
|
results['initial_solution_stats']['evaluation_errors'] += 1 |
|
else: |
|
|
|
llm_gen = result['steps'].get('llm_generation', {}) |
|
eval_result = llm_gen.get('solution_evaluation') |
|
|
|
if eval_result: |
|
if eval_result['correct']: |
|
results['initial_solution_stats']['first_attempt_correct'] += 1 |
|
results['initial_solution_stats']['at_least_once_correct'] += 1 |
|
results['initial_solution_stats']['correct'] += 1 |
|
initial_solution_correct = True |
|
else: |
|
|
|
if problem_id not in results['initial_solution_stats']['first_attempt_failed_problem_ids']: |
|
results['initial_solution_stats']['first_attempt_failed_problem_ids'].append(problem_id) |
|
if problem_id not in results['initial_solution_stats']['never_success_problem_ids']: |
|
results['initial_solution_stats']['never_success_problem_ids'].append(problem_id) |
|
if problem_id not in results['initial_solution_stats']['failed_problem_ids']: |
|
results['initial_solution_stats']['failed_problem_ids'].append(problem_id) |
|
|
|
if eval_result.get('error'): |
|
results['initial_solution_stats']['evaluation_errors'] += 1 |
|
|
|
if not llm_gen.get('syntax_valid', True): |
|
results['initial_solution_stats']['syntax_errors'] += 1 |
|
|
|
|
|
ipo_step = result['steps'].get('ipo_extraction', {}) |
|
if ipo_step.get('success', False) and ipo_step.get('triples'): |
|
results['ipo_extraction_stats']['successful'] += 1 |
|
else: |
|
results['ipo_extraction_stats']['failed'] += 1 |
|
if problem_id not in results['ipo_extraction_stats']['failed_problem_ids']: |
|
results['ipo_extraction_stats']['failed_problem_ids'].append(problem_id) |
|
logger.log_info(f" โ ๏ธ IPO extraction failed for {problem_id}") |
|
|
|
|
|
if ipo_step.get('success', False): |
|
results['input_generation_stats']['total_attempts'] += 1 |
|
|
|
if ipo_step.get('num_generated', 0) > 0: |
|
results['input_generation_stats']['successful'] += 1 |
|
results['input_generation_stats']['total_generated_inputs'] += ipo_step['num_generated'] |
|
if problem_id not in results['input_generation_stats']['problems_with_generation']: |
|
results['input_generation_stats']['problems_with_generation'].append(problem_id) |
|
else: |
|
results['input_generation_stats']['failed'] += 1 |
|
|
|
|
|
baseline_step = result['steps'].get('baseline_evaluation', {}) |
|
if baseline_step: |
|
results['current_evaluation_stats']['total_attempts'] += 1 |
|
|
|
if baseline_step.get('success', False): |
|
results['current_evaluation_stats']['successful'] += 1 |
|
results['current_evaluation_stats']['total_rounds'] += baseline_step.get('total_rounds', 0) |
|
results['current_evaluation_stats']['total_success_rounds'] += baseline_step.get('success_count', 0) |
|
else: |
|
results['current_evaluation_stats']['failed'] += 1 |
|
if problem_id not in results['current_evaluation_stats']['failed_problem_ids']: |
|
results['current_evaluation_stats']['failed_problem_ids'].append(problem_id) |
|
|
|
|
|
diverse_step = result['steps'].get('diverse_programs', {}) |
|
if diverse_step: |
|
results['diverse_programs_stats']['total_attempts'] += 1 |
|
|
|
if diverse_step.get('success', False): |
|
results['diverse_programs_stats']['successful'] += 1 |
|
results['diverse_programs_stats']['total_programs_generated'] += diverse_step.get('total_programs', 0) |
|
results['diverse_programs_stats']['total_valid_programs'] += diverse_step.get('valid_programs', 0) |
|
results['diverse_programs_stats']['total_ipo_triples'] += diverse_step.get('total_ipo_triples', 0) |
|
else: |
|
results['diverse_programs_stats']['failed'] += 1 |
|
if problem_id not in results['diverse_programs_stats']['failed_problem_ids']: |
|
results['diverse_programs_stats']['failed_problem_ids'].append(problem_id) |
|
|
|
|
|
reward_step = result['steps'].get('reward_computation', {}) |
|
rewards = reward_step.get('rewards', {}) |
|
|
|
|
|
for task_type, type_rewards in rewards.get('rewards_by_type', {}).items(): |
|
if type_rewards: |
|
results['reasoning_task_stats'][task_type]['total'] += 1 |
|
|
|
|
|
task_accuracies = [reward['basic_accuracy'] for reward in type_rewards] |
|
problem_avg_accuracy = sum(task_accuracies) / len(task_accuracies) |
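# Per-problem accuracy for this task type is the mean basic_accuracy over its tasks.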
|
|
|
|
|
results['reasoning_task_stats'][task_type]['total_accuracy'] += problem_avg_accuracy |
|
|
|
|
|
if problem_avg_accuracy > 0: |
|
results['reasoning_task_stats'][task_type]['correct'] += 1 |
|
|
|
|
|
if problem_avg_accuracy == 0.0: |
|
results['reasoning_task_stats'][task_type]['accuracy_0_count'] += 1 |
|
elif problem_avg_accuracy == 1.0: |
|
results['reasoning_task_stats'][task_type]['accuracy_1_count'] += 1 |
|
|
|
|
|
|
|
problem_result = { |
|
'problem_id': problem_id, |
|
'success': result['success'], |
|
'error': result.get('error'), |
|
'step_results': step_results, |
|
'initial_solution_correct': initial_solution_correct, |
|
'reasoning_tasks_correct': {}, |
|
'time_seconds': problem_duration |
|
} |
|
|
|
if result['success']: |
|
|
|
reward_step = result['steps'].get('reward_computation', {}) |
|
rewards = reward_step.get('rewards', {}) |
|
|
|
for task_type, type_rewards in rewards.get('rewards_by_type', {}).items(): |
|
correct_count = sum(1 for r in type_rewards if r['basic_accuracy'] > 0) |
|
total_count = len(type_rewards) |
|
accuracy_0_count = sum(1 for r in type_rewards if r['basic_accuracy'] == 0) |
|
accuracy_1_count = sum(1 for r in type_rewards if r['basic_accuracy'] == 1) |
|
|
|
|
|
problem_average = sum(r['basic_accuracy'] for r in type_rewards) / len(type_rewards) if type_rewards else 0.0 |
|
|
|
problem_result['reasoning_tasks_correct'][task_type] = { |
|
'correct_count': correct_count, |
|
'total_count': total_count, |
|
'accuracy_0_count': accuracy_0_count, |
|
'accuracy_1_count': accuracy_1_count, |
|
'problem_average_accuracy': problem_average, |
|
'summary': f"{correct_count}/{total_count} (avg: {problem_average:.3f})" |
|
} |
|
|
|
|
|
results['timing_stats']['problem_times'].append({ |
|
'problem_id': problem_id, |
|
'time_seconds': problem_duration, |
|
'time_formatted': f"{problem_duration:.2f}s" |
|
}) |
|
|
|
results['problem_results'].append(problem_result) |
|
|
|
|
|
if result['success']: |
|
logger.log_info(f" โ
Success - Initial: {'โ
' if problem_result['initial_solution_correct'] else 'โ'}") |
|
else: |
|
logger.log_error(f" โ Failed: {result.get('error', 'Unknown error')}") |
|
|
|
except Exception as e: |
|
|
|
problem_end_time = time.time() |
|
problem_duration = problem_end_time - problem_start_time |
|
|
|
logger.log_error(f" ๐ฅ Exception during pipeline execution: {e}") |
|
logger.log_error(f" ๐ Problem Loading: โ (Exception)") |
|
logger.log_error(f" ๐ค LLM Generation: โ (Exception)") |
|
logger.log_error(f" ๐ Solution Evaluation: โ (Exception)") |
|
logger.log_error(f" ๐ IPO Extraction: โ (Exception)") |
|
logger.log_error(f" ๐ Task Generation: โ (Exception)") |
|
logger.log_error(f" ๐ง Task Evaluation: โ (Exception)") |
|
|
|
|
|
results['initial_solution_stats']['total'] += 1 |
|
|
|
if problem_id not in results['initial_solution_stats']['first_attempt_failed_problem_ids']: |
|
results['initial_solution_stats']['first_attempt_failed_problem_ids'].append(problem_id) |
|
if problem_id not in results['initial_solution_stats']['never_success_problem_ids']: |
|
results['initial_solution_stats']['never_success_problem_ids'].append(problem_id) |
|
if problem_id not in results['initial_solution_stats']['failed_problem_ids']: |
|
results['initial_solution_stats']['failed_problem_ids'].append(problem_id) |
|
|
|
results['ipo_extraction_stats']['total_attempts'] += 1 |
|
results['ipo_extraction_stats']['failed'] += 1 |
|
if problem_id not in results['ipo_extraction_stats']['failed_problem_ids']: |
|
results['ipo_extraction_stats']['failed_problem_ids'].append(problem_id) |
|
|
|
|
|
results['problem_results'].append({ |
|
'problem_id': problem_id, |
|
'success': False, |
|
'error': str(e), |
|
'step_results': { |
|
'problem_loading': False, |
|
'llm_generation': False, |
|
'solution_evaluation': False, |
|
'ipo_extraction': False, |
|
'input_generation': False, |
|
'task_generation': False, |
|
'task_evaluation': False |
|
}, |
|
'initial_solution_correct': False, |
|
'reasoning_tasks_correct': {}, |
|
'time_seconds': problem_duration |
|
}) |
|
|
|
|
|
results['timing_stats']['problem_times'].append({ |
|
'problem_id': problem_id, |
|
'time_seconds': problem_duration, |
|
'time_formatted': f"{problem_duration:.2f}s" |
|
}) |
|
|
|
|
|
end_total_time = time.time() |
|
total_duration = end_total_time - start_total_time |
|
|
|
|
|
results['timing_stats']['total_time_seconds'] = total_duration |
|
if len(problems) > 0: |
|
results['timing_stats']['average_time_per_problem'] = total_duration / len(problems) |
|
|
|
|
|
logger.log_info("๐ Computing final statistics...") |
|
|
|
|
|
input_stats = results['input_generation_stats'] |
|
if input_stats['successful'] > 0: |
|
input_stats['average_inputs_per_problem'] = input_stats['total_generated_inputs'] / input_stats['successful'] |
|
|
|
|
|
current_stats = results['current_evaluation_stats'] |
|
if current_stats['total_rounds'] > 0: |
|
current_stats['average_success_rate'] = current_stats['total_success_rounds'] / current_stats['total_rounds'] |
|
|
|
|
|
diverse_stats = results['diverse_programs_stats'] |
|
if diverse_stats['successful'] > 0: |
|
diverse_stats['average_programs_per_problem'] = diverse_stats['total_programs_generated'] / diverse_stats['successful'] |
|
diverse_stats['average_ipo_per_problem'] = diverse_stats['total_ipo_triples'] / diverse_stats['successful'] |
|
|
|
|
|
logger.log_info(f"โฑ๏ธ Total execution time: {total_duration:.2f}s ({total_duration/60:.1f}min)") |
|
logger.log_info(f"โฑ๏ธ Average time per problem: {results['timing_stats']['average_time_per_problem']:.2f}s") |
|
|
|
|
|
initial_stats = results['initial_solution_stats'] |
|
if initial_stats['total'] > 0: |
|
|
|
first_attempt_accuracy = initial_stats['first_attempt_correct'] / initial_stats['total'] |
|
logger.log_info(f"๐ First Attempt Accuracy: {first_attempt_accuracy:.3f} ({initial_stats['first_attempt_correct']}/{initial_stats['total']})") |
|
|
|
|
|
at_least_once_accuracy = initial_stats['at_least_once_correct'] / initial_stats['total'] |
|
logger.log_info(f"๐ At-Least-Once Success Rate: {at_least_once_accuracy:.3f} ({initial_stats['at_least_once_correct']}/{initial_stats['total']})") |
|
|
|
|
|
if initial_stats['total_attempts'] > 0: |
|
average_accuracy = initial_stats['total_successes'] / initial_stats['total_attempts'] |
|
logger.log_info(f"๐ Average Success Rate (5 attempts): {average_accuracy:.3f} ({initial_stats['total_successes']}/{initial_stats['total_attempts']})") |
|
|
|
logger.log_info(f"๐ First attempt failed problems: {len(initial_stats['first_attempt_failed_problem_ids'])}/{initial_stats['total']}") |
|
logger.log_info(f"๐ Never success problems: {len(initial_stats['never_success_problem_ids'])}/{initial_stats['total']}") |
|
|
|
|
|
ipo_stats = results['ipo_extraction_stats'] |
|
if ipo_stats['total_attempts'] > 0: |
|
ipo_success_rate = ipo_stats['successful'] / ipo_stats['total_attempts'] |
|
logger.log_info(f"๐ IPO Extraction Success Rate: {ipo_success_rate:.3f} ({ipo_stats['successful']}/{ipo_stats['total_attempts']})") |
|
logger.log_info(f"๐ IPO Extraction Failed: {ipo_stats['failed']} problems") |
|
|
|
|
|
if input_stats['total_attempts'] > 0: |
|
input_success_rate = input_stats['successful'] / input_stats['total_attempts'] |
|
logger.log_info(f"๐ฒ Input Generation Success Rate: {input_success_rate:.3f} ({input_stats['successful']}/{input_stats['total_attempts']})") |
|
logger.log_info(f"๐ฒ Total Generated Inputs: {input_stats['total_generated_inputs']}") |
|
logger.log_info(f"๐ฒ Average Inputs per Problem: {input_stats['average_inputs_per_problem']:.2f}") |
|
|
|
|
|
if current_stats['total_attempts'] > 0: |
|
current_success_rate = current_stats['successful'] / current_stats['total_attempts'] |
|
logger.log_info(f"๐ Current Evaluation Success Rate: {current_success_rate:.3f} ({current_stats['successful']}/{current_stats['total_attempts']})") |
|
logger.log_info(f"๐ Total Current Rounds: {current_stats['total_rounds']}") |
|
logger.log_info(f"๐ Average Success Rate: {current_stats['average_success_rate']:.3f}") |
|
|
|
|
|
if diverse_stats['total_attempts'] > 0: |
|
diverse_success_rate = diverse_stats['successful'] / diverse_stats['total_attempts'] |
|
logger.log_info(f"๐จ Diverse Programs Success Rate: {diverse_success_rate:.3f} ({diverse_stats['successful']}/{diverse_stats['total_attempts']})") |
|
logger.log_info(f"๐จ Total Programs Generated: {diverse_stats['total_programs_generated']}") |
|
logger.log_info(f"๐จ Total Valid Programs: {diverse_stats['total_valid_programs']}") |
|
logger.log_info(f"๐จ Total IPO Triples: {diverse_stats['total_ipo_triples']}") |
|
logger.log_info(f"๐จ Average Programs per Problem: {diverse_stats['average_programs_per_problem']:.2f}") |
|
logger.log_info(f"๐จ Average IPO per Problem: {diverse_stats['average_ipo_per_problem']:.2f}") |
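
    # Per-task-type reasoning accuracy, including counts of problems at accuracy 0 and 1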
|
|
|
|
|
for task_type, stats in results['reasoning_task_stats'].items(): |
|
if stats['total'] > 0: |
|
task_accuracy = stats['correct'] / stats['total'] |
|
logger.log_info(f"๐ {task_type.title()} Task Accuracy: {task_accuracy:.3f} ({stats['correct']}/{stats['total']})") |
|
logger.log_info(f" - Accuracy=0: {stats['accuracy_0_count']}, Accuracy=1: {stats['accuracy_1_count']}") |
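
    # Persist the full statistics dictionary as JSON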
|
|
|
|
|
result_file = os.path.join(output_dir, f"batch_evaluation_results.json") |
|
with open(result_file, 'w', encoding='utf-8') as f: |
|
json.dump(results, f, indent=2, ensure_ascii=False) |
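
    # Write a human-readable Markdown summary alongside the raw JSON results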
|
|
|
|
|
summary_file = os.path.join(output_dir, f"evaluation_summary.md") |
|
with open(summary_file, 'w', encoding='utf-8') as f: |
|
f.write(f"# TestTime RLVR Batch Evaluation Report\n\n") |
|
f.write(f"**Model**: {args.model}\n") |
|
f.write(f"**Benchmark**: {args.benchmark}\n") |
|
f.write(f"**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") |
|
f.write(f"**Total Problems**: {results['initial_solution_stats']['total']}\n") |
|
f.write(f"**Output Directory**: `{output_dir}`\n\n") |
|
|
|
        f.write(f"## Directory Structure\n")
        f.write(f"```\n")
        f.write(f"{output_dir}/\n")
        f.write(f"├── batch_evaluation_results.json      # Full statistics (JSON)\n")
        f.write(f"├── evaluation_summary.md              # This summary report\n")
        f.write(f"└── {args.benchmark}/                   # Detailed per-benchmark results\n")
        f.write(f"    └── [problem_id]/                   # One directory per problem\n")
        f.write(f"        ├── initial_solution/           # Initial LLM solutions\n")
        f.write(f"        ├── ipo_triples/                # IPO triples\n")
        f.write(f"        ├── task_prompts/               # Generated tasks\n")
        f.write(f"        ├── llm_responses/              # LLM responses\n")
        f.write(f"        └── [problem_id]_summary.json   # Per-problem summary\n")
        f.write(f"```\n\n")
|
|
|
|
|
f.write(f"## Timing Statistics\n") |
|
f.write(f"- **Total Execution Time**: {total_duration:.2f}s ({total_duration/60:.1f} minutes)\n") |
|
f.write(f"- **Average Time per Problem**: {results['timing_stats']['average_time_per_problem']:.2f}s\n") |
|
        # Compute the fastest/slowest problems once instead of repeating min()/max()
        fastest = min(results['timing_stats']['problem_times'], key=lambda x: x['time_seconds'])
        slowest = max(results['timing_stats']['problem_times'], key=lambda x: x['time_seconds'])
        f.write(f"- **Fastest Problem**: {fastest['time_formatted']} ({fastest['problem_id']})\n")
        f.write(f"- **Slowest Problem**: {slowest['time_formatted']} ({slowest['problem_id']})\n\n")
|
|
|
f.write(f"## Current Evaluation Performance (5 attempts per problem)\n\n") |
|
|
|
|
|
first_attempt_accuracy = initial_stats['first_attempt_correct'] / initial_stats['total'] if initial_stats['total'] > 0 else 0 |
|
f.write(f"### 1. First Attempt Accuracy\n") |
|
f.write(f"- **Accuracy**: {first_attempt_accuracy:.3f} ({initial_stats['first_attempt_correct']}/{initial_stats['total']})\n") |
|
f.write(f"- **Description**: Success rate based on first attempt only\n\n") |
|
|
|
|
|
at_least_once_accuracy = initial_stats['at_least_once_correct'] / initial_stats['total'] if initial_stats['total'] > 0 else 0 |
|
f.write(f"### 2. At-Least-Once Success Rate\n") |
|
f.write(f"- **Accuracy**: {at_least_once_accuracy:.3f} ({initial_stats['at_least_once_correct']}/{initial_stats['total']})\n") |
|
f.write(f"- **Description**: Problems where at least 1 out of 5 attempts succeeded\n\n") |
|
|
|
|
|
if initial_stats['total_attempts'] > 0: |
|
average_accuracy = initial_stats['total_successes'] / initial_stats['total_attempts'] |
|
f.write(f"### 3. Average Success Rate (5 attempts)\n") |
|
f.write(f"- **Accuracy**: {average_accuracy:.3f}\n") |
|
f.write(f"- **Description**: Average of individual problem success rates across 5 attempts\n") |
|
            f.write(f"- **Total Evaluations**: {initial_stats['total_attempts']} ({initial_stats['total']} × 5)\n")
|
f.write(f"- **Total Successes**: {initial_stats['total_successes']}\n\n") |
|
|
|
|
|
f.write(f"### Additional Statistics\n") |
|
f.write(f"- **Syntax Errors**: {initial_stats['syntax_errors']}\n") |
|
f.write(f"- **Evaluation Errors**: {initial_stats['evaluation_errors']}\n\n") |
|
|
|
|
|
f.write(f"## Pipeline Step Success Statistics\n") |
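
        # Count how many problems completed each pipeline step successfully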
|
|
|
|
|
step_stats = { |
|
'problem_loading': 0, |
|
'llm_generation': 0, |
|
'solution_evaluation': 0, |
|
'ipo_extraction': 0, |
|
'input_generation': 0, |
|
'task_generation': 0, |
|
'task_evaluation': 0 |
|
} |
|
|
|
for problem_result in results['problem_results']: |
|
if 'step_results' in problem_result: |
|
for step, success in problem_result['step_results'].items(): |
|
if success: |
|
step_stats[step] += 1 |
|
|
|
        total_problems = results['initial_solution_stats']['total']

        # Guard against division by zero when no problems were evaluated
        if total_problems > 0:
            f.write(f"- **Problem Loading**: {step_stats['problem_loading']}/{total_problems} ({step_stats['problem_loading']/total_problems*100:.1f}%)\n")
            f.write(f"- **LLM Generation**: {step_stats['llm_generation']}/{total_problems} ({step_stats['llm_generation']/total_problems*100:.1f}%)\n")
            f.write(f"- **Solution Evaluation**: {step_stats['solution_evaluation']}/{total_problems} ({step_stats['solution_evaluation']/total_problems*100:.1f}%)\n")
            f.write(f"- **IPO Extraction**: {step_stats['ipo_extraction']}/{total_problems} ({step_stats['ipo_extraction']/total_problems*100:.1f}%)\n")
            f.write(f"- **Input Generation**: {step_stats['input_generation']}/{total_problems} ({step_stats['input_generation']/total_problems*100:.1f}%)\n")
            f.write(f"- **Task Generation**: {step_stats['task_generation']}/{total_problems} ({step_stats['task_generation']/total_problems*100:.1f}%)\n")
            f.write(f"- **Task Evaluation**: {step_stats['task_evaluation']}/{total_problems} ({step_stats['task_evaluation']/total_problems*100:.1f}%)\n\n")
|
|
|
|
|
ipo_stats = results['ipo_extraction_stats'] |
|
if ipo_stats['total_attempts'] > 0: |
|
ipo_success_rate = ipo_stats['successful'] / ipo_stats['total_attempts'] |
|
f.write(f"## IPO Extraction Performance\n") |
|
f.write(f"- **Total Attempts**: {ipo_stats['total_attempts']}\n") |
|
f.write(f"- **Successful**: {ipo_stats['successful']}\n") |
|
f.write(f"- **Failed**: {ipo_stats['failed']}\n") |
|
f.write(f"- **Success Rate**: {ipo_success_rate:.3f}\n\n") |
|
|
|
|
|
if ipo_stats['failed_problem_ids']: |
|
f.write(f"### IPO Extraction Failed Problem IDs\n") |
|
for problem_id in ipo_stats['failed_problem_ids']: |
|
f.write(f"- `{problem_id}`\n") |
|
f.write(f"\n") |
|
|
|
|
|
input_gen_stats = results.get('input_generation_stats', {}) |
|
if input_gen_stats and input_gen_stats['total_attempts'] > 0: |
|
gen_success_rate = input_gen_stats['successful'] / input_gen_stats['total_attempts'] |
|
f.write(f"## Input Generation Performance\n") |
|
f.write(f"- **Total Attempts**: {input_gen_stats['total_attempts']}\n") |
|
f.write(f"- **Successful**: {input_gen_stats['successful']}\n") |
|
f.write(f"- **Failed**: {input_gen_stats['failed']}\n") |
|
f.write(f"- **Success Rate**: {gen_success_rate:.3f}\n") |
|
f.write(f"- **Total Generated Inputs**: {input_gen_stats['total_generated_inputs']}\n") |
|
f.write(f"- **Average Inputs per Problem**: {input_gen_stats['average_inputs_per_problem']:.2f}\n\n") |
|
|
|
|
|
if input_gen_stats.get('problems_with_generation'): |
|
f.write(f"### Problems with Input Generation\n") |
|
f.write(f"Total: {len(input_gen_stats['problems_with_generation'])} problems\n") |
|
|
|
for i, problem_id in enumerate(input_gen_stats['problems_with_generation'][:10]): |
|
f.write(f"- `{problem_id}`\n") |
|
if len(input_gen_stats['problems_with_generation']) > 10: |
|
f.write(f"- ... and {len(input_gen_stats['problems_with_generation']) - 10} more\n") |
|
f.write(f"\n") |
|
|
|
|
|
f.write(f"## Problem Classification\n\n") |
|
|
|
|
|
f.write(f"### ๐ First Attempt Results\n") |
|
f.write(f"- **Success**: {initial_stats['first_attempt_correct']} problems\n") |
|
f.write(f"- **Failure**: {len(initial_stats['first_attempt_failed_problem_ids'])} problems\n\n") |
|
|
|
|
|
f.write(f"### ๐ Five-Attempt Results\n") |
|
f.write(f"- **At-Least-Once Success**: {initial_stats['at_least_once_correct']} problems\n") |
|
f.write(f"- **Never Success**: {len(initial_stats['never_success_problem_ids'])} problems\n\n") |
|
|
|
|
|
if initial_stats['first_attempt_failed_problem_ids']: |
|
f.write(f"### First Attempt Failed Problem IDs\n") |
|
for problem_id in initial_stats['first_attempt_failed_problem_ids']: |
|
f.write(f"- `{problem_id}`\n") |
|
f.write(f"\n") |
|
|
|
|
|
if initial_stats['never_success_problem_ids']: |
|
f.write(f"### Never Success Problem IDs (0/5)\n") |
|
for problem_id in initial_stats['never_success_problem_ids']: |
|
f.write(f"- `{problem_id}`\n") |
|
f.write(f"\n") |
|
|
|
f.write(f"## Reasoning Task Performance\n") |
|
f.write(f"*Note: Statistics based on problem-level average accuracy for each task type*\n\n") |
|
|
|
for task_type, stats in results['reasoning_task_stats'].items(): |
|
if stats['total'] > 0: |
|
|
|
overall_accuracy = stats['total_accuracy'] / stats['total'] |
|
partial_count = stats['total'] - stats['accuracy_0_count'] - stats['accuracy_1_count'] |
|
|
|
f.write(f"### {task_type.title()} Tasks\n") |
|
f.write(f"- **Total Problems**: {stats['total']} (problems that had {task_type} tasks)\n") |
|
f.write(f"- **Problems with >0 Avg Accuracy**: {stats['correct']}\n") |
|
f.write(f"- **Overall Success Rate**: {overall_accuracy:.3f}\n") |
|
f.write(f"- **Problems with Avg Accuracy = 0.0**: {stats['accuracy_0_count']} problems\n") |
|
f.write(f"- **Problems with Avg Accuracy = 1.0**: {stats['accuracy_1_count']} problems\n") |
|
f.write(f"- **Problems with Partial Accuracy**: {partial_count} problems\n\n") |
|
|
|
|
|
f.write(generate_detailed_classification(output_dir, args.benchmark)) |
|
|
|
f.write(f"## Files\n") |
|
f.write(f"- **Detailed Results**: {result_file}\n") |
|
f.write(f"- **Summary Report**: {summary_file}\n") |
|
f.write(f"- **First Attempt Failed Problems**: See 'First Attempt Failed Problem IDs' section above\n") |
|
f.write(f"- **Never Success Problems**: See 'Never Success Problem IDs' section above\n") |
|
if ipo_stats['failed_problem_ids']: |
|
f.write(f"- **IPO Extraction Failed Problems**: See 'IPO Extraction Failed Problem IDs' section above and ipo_extraction_failed_problems.txt\n") |
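
    # Also save IPO-extraction failures to a plain-text file for follow-up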
|
|
|
|
|
if ipo_stats['failed_problem_ids']: |
|
failed_ipo_file = os.path.join(output_dir, f"ipo_extraction_failed_problems.txt") |
|
with open(failed_ipo_file, 'w', encoding='utf-8') as f: |
|
f.write(f"# IPO Extraction Failed Problems\n") |
|
f.write(f"# Benchmark: {args.benchmark}\n") |
|
f.write(f"# Model: {args.model}\n") |
|
f.write(f"# Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") |
|
f.write(f"# Total Failed: {len(ipo_stats['failed_problem_ids'])}/{ipo_stats['total_attempts']}\n") |
|
f.write(f"# Success Rate: {(ipo_stats['successful'] / ipo_stats['total_attempts']):.3f}\n") |
|
f.write(f"#\n") |
|
for problem_id in ipo_stats['failed_problem_ids']: |
|
f.write(f"{problem_id}\n") |
|
|
|
logger.log_info(f"๐ IPO extraction failed problems saved: {failed_ipo_file}") |
|
|
|
    logger.log_info(f"✅ Batch evaluation completed!")
|
logger.log_info(f"๐ Results saved to: {output_dir}") |
|
logger.log_info(f" ๐ Summary report: evaluation_summary.md") |
|
logger.log_info(f" ๐ Statistics JSON: batch_evaluation_results.json") |
|
logger.log_info(f" ๐ Detailed results: {args.benchmark}/[problem_id]/") |
|
    logger.log_info(f"       ├── initial_solution/   # Initial LLM solutions")
    logger.log_info(f"       ├── ipo_triples/        # IPO triples")
    logger.log_info(f"       ├── task_prompts/       # Generated tasks")
    logger.log_info(f"       └── llm_responses/      # LLM responses")
|
|
|
if ipo_stats['failed_problem_ids']: |
|
logger.log_info(f"๐ IPO failed problems: {len(ipo_stats['failed_problem_ids'])} problems saved to ipo_extraction_failed_problems.txt") |
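
    # Best-effort cleanup: shut down the vLLM engine and release GPU memory before exiting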
|
|
|
|
|
try: |
|
import gc |
|
import torch |
|
|
|
|
|
if hasattr(model, 'llm_engine'): |
|
|
|
if hasattr(model.llm_engine, 'model_executor'): |
|
logger.log_info("๐ Shutting down VLLM model executor...") |
|
model.llm_engine.model_executor.shutdown() |
|
|
|
del model.llm_engine |
|
|
|
|
|
del model |
|
|
|
|
|
if torch.cuda.is_available(): |
|
torch.cuda.empty_cache() |
|
torch.cuda.synchronize() |
|
|
|
|
|
gc.collect() |
|
|
|
logger.log_info("๐งน Model cleanup completed properly") |
|
|
|
except Exception as e: |
|
logger.log_warning(f"โ ๏ธ Model cleanup failed: {e}") |
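
        # If normal cleanup fails, terminate any remaining child processes via psutil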
|
|
|
|
|
logger.log_warning("๐จ Attempting emergency cleanup...") |
|
try: |
|
import psutil |
|
|
|
|
|
current_pid = os.getpid() |
|
parent = psutil.Process(current_pid) |
|
|
|
for child in parent.children(recursive=True): |
|
try: |
|
child.terminate() |
|
child.wait(timeout=2) |
|
except (psutil.NoSuchProcess, psutil.TimeoutExpired): |
|
try: |
|
child.kill() |
|
except psutil.NoSuchProcess: |
|
pass |
|
|
|
logger.log_warning("๐จ Emergency cleanup completed") |
|
except Exception as cleanup_error: |
|
logger.log_error(f"๐ฅ Emergency cleanup also failed: {cleanup_error}") |
|
|
|
try: |
|
os._exit(0) |
|
except: |
|
pass |
|
|
|
return True |
|
|
|
|
|
def main(): |
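    """Parse CLI arguments and run the batch evaluation.

    Illustrative invocation (script filename is assumed; adjust as needed):
        python batch_eval.py --model Qwen/Qwen2.5-7B --benchmark mbpp \
            --max_problems 0 --gpu 0 --output_dir ./batch_results
    """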
|
parser = argparse.ArgumentParser(description='Batch TestTime RLVR Evaluation') |
|
parser.add_argument('--model', type=str, default='Qwen/Qwen2.5-7B', |
|
help='Model name to evaluate') |
|
parser.add_argument('--benchmark', type=str, choices=['humaneval', 'mbpp'], |
|
default='mbpp', help='Benchmark to evaluate') |
|
parser.add_argument('--max_problems', type=int, default=10, |
|
help='Maximum number of problems to evaluate (0 = all)') |
|
parser.add_argument('--gpu', type=int, default=6, help='GPU ID to use') |
|
parser.add_argument('--output_dir', type=str, |
|
default='./batch_results', |
|
help='Output directory for results') |
|
parser.add_argument('--resume', action='store_true', |
|
help='Resume from previously completed problems') |
|
parser.add_argument('--start_from', type=str, default=None, |
|
help='Start from specific problem ID (e.g., Mbpp/100)') |
|
|
|
args = parser.parse_args() |
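
    # Respect an externally set CUDA_VISIBLE_DEVICES; otherwise pin to the GPU given by --gpu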
|
|
|
|
|
if 'CUDA_VISIBLE_DEVICES' not in os.environ: |
|
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) |
|
|
|
print(f"๐ฏ CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}") |
|
print(f"๐ฏ Using GPU argument: {args.gpu}") |
|
|
|
|
|
os.makedirs(args.output_dir, exist_ok=True) |
|
|
|
try: |
|
success = run_batch_evaluation(args) |
|
exit_code = 0 if success else 1 |
|
except Exception as e: |
|
print(f"๐ฅ Batch evaluation failed: {e}") |
|
traceback.print_exc() |
|
exit_code = 1 |
|
|
|
print(f"\n๐ช Exiting with code {exit_code}") |
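
    # Prefer os._exit to skip interpreter teardown; fall back to sys.exit otherwise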
|
|
|
|
|
try: |
|
os._exit(exit_code) |
|
except: |
|
sys.exit(exit_code) |
|
|
|
|
|
if __name__ == '__main__': |
|
main() |