neural-mesh-v2 / test_evalplus_single.py
#!/usr/bin/env python3
"""
EvalPlus ๋ฐฉ์‹์œผ๋กœ ๋‹จ์ผ MBPP ๋ฌธ์ œ ํ…Œ์ŠคํŠธ
"""
import json
import sys
from pathlib import Path
# Add the script's own directory to sys.path so local packages import cleanly
sys.path.append(str(Path(__file__).parent))
from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator
from absolute_zero_reasoner.testtime.config import TestTimeConfig
from absolute_zero_reasoner.testtime.logger import TestTimeLogger
def load_single_problem(task_id: str = "Mbpp/2"):
"""๋‹จ์ผ MBPP ๋ฌธ์ œ ๋กœ๋“œ"""
dataset_path = Path("/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/data/MbppPlus.jsonl")
with open(dataset_path, 'r') as f:
for line in f:
problem = json.loads(line.strip())
if problem['task_id'] == task_id:
return problem
raise ValueError(f"Problem {task_id} not found")
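
# Sketch of the assumed MbppPlus.jsonl record shape. Only 'task_id',
# 'entry_point', 'canonical_solution', 'base_input', and 'plus_input' are
# actually read by this script; the other keys are assumptions about the
# EvalPlus dump and may differ between dataset versions:
#
#   {"task_id": "Mbpp/2", "entry_point": "similar_elements",
#    "prompt": "...", "canonical_solution": "def similar_elements(...): ...",
#    "base_input": [[...], ...], "plus_input": [[...], ...]}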
def test_evalplus_evaluation():
"""EvalPlus ์Šคํƒ€์ผ ํ‰๊ฐ€ ํ…Œ์ŠคํŠธ"""
# ์„ค์ •
config = TestTimeConfig()
logger = TestTimeLogger()
    # Minimal mock model and tokenizer (no real inference happens in this test)
class MockModel:
def device(self):
return 'cpu'
class MockTokenizer:
eos_token = '</s>'
pad_token = '</s>'
    # Initialize the solution generator
generator = InitialSolutionGenerator(
model=MockModel(),
tokenizer=MockTokenizer(),
config=config,
logger=logger,
use_vllm=False
)
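    # use_vllm=False is assumed to route the generator down its plain
    # (non-vLLM) path, so the mocks above only need to survive attribute
    # lookups; no generate() call is exercised by this test.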
    # Load the problem
problem = load_single_problem("Mbpp/2")
print(f"Testing problem: {problem['task_id']}")
print(f"Entry point: {problem['entry_point']}")
print(f"Base inputs: {len(problem.get('base_input', []))}")
print(f"Plus inputs: {len(problem.get('plus_input', []))}")
    # First, test with the canonical (reference) solution
solution = problem['canonical_solution']
print("\nTesting with canonical solution:")
print(solution[:200] + "..." if len(solution) > 200 else solution)
    # Run the evaluation
result = generator.evaluate_solution(problem, solution)
    # Print results
print("\n=== Evaluation Result ===")
print(f"Correct: {result['correct']}")
print(f"Base passed: {result['base_passed']}/{result['base_total']}")
print(f"Plus passed: {result['plus_passed']}/{result['plus_total']}")
print(f"Total passed: {result['passed_tests']}/{result['total_tests']}")
if result['error']:
print(f"Error: {result['error']}")
    # Now test with a deliberately incorrect solution
    wrong_solution = """
def similar_elements(test_tup1, test_tup2):
    # Intentionally wrong: return the union instead of the intersection
    return tuple(set(test_tup1) | set(test_tup2))
"""
print("\n\nTesting with wrong solution:")
print(wrong_solution)
result2 = generator.evaluate_solution(problem, wrong_solution)
print("\n=== Evaluation Result (Wrong Solution) ===")
print(f"Correct: {result2['correct']}")
print(f"Base passed: {result2['base_passed']}/{result2['base_total']}")
print(f"Plus passed: {result2['plus_passed']}/{result2['plus_total']}")
print(f"Total passed: {result2['passed_tests']}/{result2['total_tests']}")
if result2['error']:
print(f"Error: {result2['error']}")
if __name__ == "__main__":
test_evalplus_evaluation()
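
# Optional cross-check, shown as a sketch only (assumes the `evalplus` package
# is installed; not invoked by this script): the same problem can be loaded
# through evalplus's own dataset loader instead of the local JSONL dump.
#
#   from evalplus.data import get_mbpp_plus
#   problems = get_mbpp_plus()        # dict keyed by task_id, e.g. "Mbpp/2"
#   problem = problems["Mbpp/2"]
#   print(problem["entry_point"])     # expected: "similar_elements"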