#!/usr/bin/env python3
"""
๋ฒค์น˜๋งˆํฌ ๋ฌธ์ œ ID ๋ชฉ๋ก ์กฐํšŒ ๋„๊ตฌ
์ง€์›ํ•˜๋Š” ๋ฒค์น˜๋งˆํฌ์˜ ๋ชจ๋“  ๋ฌธ์ œ ID๋ฅผ ํ™•์ธํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
HumanEval+, MBPP+, LiveCodeBench ์ง€์›
"""
import os
import sys
import json
import argparse
from pathlib import Path
from datetime import datetime

# Make the TestTime RLVR modules importable
sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2')

def load_jsonl(file_path):
    """Load a JSONL file and return a list of parsed records."""
    if not os.path.exists(file_path):
        return []
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line.strip()) for line in f if line.strip()]
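
# The loaders below only need a couple of fields per record. A typical
# HumanEvalPlus / MbppPlus entry is assumed to look roughly like
#   {"task_id": "HumanEval/0", "prompt": "def ...", ...}
# but any JSONL record with a 'task_id' key is handled.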

def list_humaneval_problems(data_path):
    """List HumanEval+ problems."""
    print("🔍 HumanEval+ problem list")
    print("="*60)
    problems = load_jsonl(data_path)
    if not problems:
        print("❌ Data file not found.")
        return []
    task_ids = []
    print(f"📊 Found {len(problems)} problems")
    for i, problem in enumerate(problems):
        task_id = problem.get('task_id', f'Unknown_{i}')
        task_ids.append(task_id)
    # Print all problem IDs in batches of 10
    print("\n📋 Full problem ID list:")
    for j in range(0, len(task_ids), 10):
        batch = task_ids[j:j + 10]
        print(f"  {', '.join(batch)}")
    return task_ids

def list_mbpp_problems(data_path):
    """List MBPP+ problems."""
    print("🔍 MBPP+ problem list")
    print("="*60)
    problems = load_jsonl(data_path)
    if not problems:
        print("❌ Data file not found.")
        return []
    task_ids = []
    print(f"📊 Found {len(problems)} problems")
    for i, problem in enumerate(problems):
        task_id = problem.get('task_id', f'Unknown_{i}')
        task_ids.append(task_id)
    # Print all problem IDs in batches of 10
    print("\n📋 Full problem ID list:")
    for j in range(0, len(task_ids), 10):
        batch = task_ids[j:j + 10]
        print(f"  {', '.join(batch)}")
    return task_ids

def list_lcb_problems(data_path):
    """List LiveCodeBench problems."""
    print("🔍 LiveCodeBench problem list")
    print("="*60)
    # LiveCodeBench may use a different directory layout
    lcb_files = list(Path(data_path).glob("**/*.jsonl")) if os.path.exists(data_path) else []
    if not lcb_files:
        print("❌ LiveCodeBench data not found.")
        return []
    all_task_ids = []
    print(f"📊 Found {len(lcb_files)} files")
    for file_path in lcb_files[:5]:  # Only inspect the first 5 files
        print(f"\n📁 File: {file_path.name}")
        problems = load_jsonl(file_path)
        for i, problem in enumerate(problems[:10]):  # Show at most 10 problems per file
            task_id = problem.get('task_id', problem.get('id', f'LCB_{i}'))
            all_task_ids.append(task_id)
            prompt_preview = problem.get('prompt', problem.get('description', ''))[:80].replace('\n', ' ')
            print(f"  {len(all_task_ids):3d}. {task_id} - {prompt_preview}...")
        if len(problems) > 10:
            print(f"    ... ({len(problems) - 10} more problems)")
    if len(lcb_files) > 5:
        print(f"\n... ({len(lcb_files) - 5} more files)")
    return all_task_ids
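
# Note: LiveCodeBench dumps do not share one fixed schema; the fallbacks above
# ('task_id' -> 'id', 'prompt' -> 'description') are an assumption about the
# local JSONL files rather than a documented format.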

def save_problem_list(task_ids, benchmark, output_dir):
    """Save the problem list to a per-benchmark directory."""
    # Create the {output_dir}/{benchmark}/ directory
    benchmark_dir = os.path.join(output_dir, benchmark)
    os.makedirs(benchmark_dir, exist_ok=True)
    # Save the full problem list
    all_problems_file = os.path.join(benchmark_dir, f"{benchmark}_all_problems.json")
    output_data = {
        'benchmark': benchmark,
        'total_problems': len(task_ids),
        'task_ids': task_ids,
        'generated_at': datetime.now().isoformat(),
        'data_source': f'{benchmark}_plus_dataset'
    }
    with open(all_problems_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print("\n💾 Problem list saved:")
    print(f"   Full list: {all_problems_file}")

def main():
    parser = argparse.ArgumentParser(description='List benchmark problem IDs')
    parser.add_argument('--benchmark', type=str, default='all',
                        choices=['all', 'humaneval', 'mbpp', 'lcb'],
                        help='Benchmark to list (all = every benchmark)')
    parser.add_argument('--save', action='store_true',
                        help='Save the results as JSON files')
    parser.add_argument('--output_dir', type=str,
                        default='/home/ubuntu/RLVR/TestTime-RLVR-v2/tmp',
                        help='Output directory')
    args = parser.parse_args()
    # Data paths, resolved relative to this file
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    humaneval_path = f'{base_dir}/evaluation/code_eval/data/HumanEvalPlus.jsonl'
    mbpp_path = f'{base_dir}/evaluation/code_eval/data/MbppPlus.jsonl'
    lcb_path = f'{base_dir}/evaluation/code_eval/coding/LiveCodeBench'
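    # Assumed dataset layout, one level above this test/ directory:
    #   evaluation/code_eval/data/HumanEvalPlus.jsonl
    #   evaluation/code_eval/data/MbppPlus.jsonl
    #   evaluation/code_eval/coding/LiveCodeBench/**/*.jsonl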
    os.makedirs(args.output_dir, exist_ok=True)
    print("🚀 TestTime RLVR benchmark problem listing tool")
    print("="*80)
    all_results = {}
    if args.benchmark in ['all', 'humaneval']:
        print("\n")
        task_ids = list_humaneval_problems(humaneval_path)
        all_results['humaneval'] = task_ids
        if args.save and task_ids:
            save_problem_list(task_ids, 'humaneval', args.output_dir)
    if args.benchmark in ['all', 'mbpp']:
        print("\n")
        task_ids = list_mbpp_problems(mbpp_path)
        all_results['mbpp'] = task_ids
        if args.save and task_ids:
            save_problem_list(task_ids, 'mbpp', args.output_dir)
    if args.benchmark in ['all', 'lcb']:
        print("\n")
        task_ids = list_lcb_problems(lcb_path)
        all_results['lcb'] = task_ids
        if args.save and task_ids:
            save_problem_list(task_ids, 'livecodebench', args.output_dir)
    # Summary
    print("\n" + "="*80)
    print("📊 Benchmark summary")
    print("="*80)
    total_problems = 0
    for benchmark, task_ids in all_results.items():
        if task_ids:
            print(f"📋 {benchmark.upper()}: {len(task_ids)} problems")
            total_problems += len(task_ids)
            # Show a sample of available problem IDs
            print(f"   Sample IDs: {', '.join(task_ids[:5])}")
            if len(task_ids) > 5:
                print(f"   ... ({len(task_ids)} total)")
    print(f"\n🎯 Total available problems: {total_problems}")
    # Usage guide
    print("\n" + "="*80)
    print("💡 Usage")
    print("="*80)
    print("Example test runs:")
    if 'humaneval' in all_results and all_results['humaneval']:
        sample_id = all_results['humaneval'][0]
        print(f"  python test_complete_pipeline.py --benchmark humaneval --problem_id \"{sample_id}\"")
    if 'mbpp' in all_results and all_results['mbpp']:
        sample_id = all_results['mbpp'][0]
        print(f"  python test_complete_pipeline.py --benchmark mbpp --problem_id \"{sample_id}\"")
    print("\nList a single benchmark only:")
    print("  python list_benchmark_problems.py --benchmark mbpp")
    print("  python list_benchmark_problems.py --benchmark humaneval --save")


if __name__ == '__main__':
    main()