#!/usr/bin/env python3
"""
EvalPlus ๋ฐฉ์์ผ๋ก ๋จ์ผ MBPP ๋ฌธ์ ํ
์คํธ
"""
import json
import sys
from pathlib import Path
# Add parent directory to path
sys.path.append(str(Path(__file__).parent))
from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator
from absolute_zero_reasoner.testtime.config import TestTimeConfig
from absolute_zero_reasoner.testtime.logger import TestTimeLogger

def load_single_problem(task_id: str = "Mbpp/2"):
    """Load a single MBPP problem by task_id from the MBPP+ dataset."""
    dataset_path = Path("/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/data/MbppPlus.jsonl")
    with open(dataset_path, 'r') as f:
        for line in f:
            problem = json.loads(line.strip())
            if problem['task_id'] == task_id:
                return problem
    raise ValueError(f"Problem {task_id} not found")

def test_evalplus_evaluation():
    """Run an EvalPlus-style evaluation on a single MBPP problem."""
    # Setup
    config = TestTimeConfig()
    logger = TestTimeLogger()

    # Minimal mock model and tokenizer (no actual model inference is run)
    class MockModel:
        def device(self):
            return 'cpu'

    class MockTokenizer:
        eos_token = '</s>'
        pad_token = '</s>'

    # Initialize the solution generator
    generator = InitialSolutionGenerator(
        model=MockModel(),
        tokenizer=MockTokenizer(),
        config=config,
        logger=logger,
        use_vllm=False
    )
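    # Assumption: evaluate_solution only executes the candidate code against the
    # problem's stored base/plus inputs, so the mock model and tokenizer are never called.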

    # Load the problem
    problem = load_single_problem("Mbpp/2")
    print(f"Testing problem: {problem['task_id']}")
    print(f"Entry point: {problem['entry_point']}")
    print(f"Base inputs: {len(problem.get('base_input', []))}")
    print(f"Plus inputs: {len(problem.get('plus_input', []))}")

    # Test with the canonical solution
    solution = problem['canonical_solution']
    print("\nTesting with canonical solution:")
    print(solution[:200] + "..." if len(solution) > 200 else solution)
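    # Expectation: the canonical solution should pass all base and plus tests.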

    # Run the evaluation
    result = generator.evaluate_solution(problem, solution)

    # Print the results
    print("\n=== Evaluation Result ===")
    print(f"Correct: {result['correct']}")
    print(f"Base passed: {result['base_passed']}/{result['base_total']}")
    print(f"Plus passed: {result['plus_passed']}/{result['plus_total']}")
    print(f"Total passed: {result['passed_tests']}/{result['total_tests']}")
    if result['error']:
        print(f"Error: {result['error']}")

    # Also test with a deliberately wrong solution
    wrong_solution = """
def similar_elements(test_tup1, test_tup2):
    # Intentionally wrong: returns the union instead of the intersection
    return tuple(set(test_tup1) | set(test_tup2))
"""
print("\n\nTesting with wrong solution:")
print(wrong_solution)
result2 = generator.evaluate_solution(problem, wrong_solution)
print("\n=== Evaluation Result (Wrong Solution) ===")
print(f"Correct: {result2['correct']}")
print(f"Base passed: {result2['base_passed']}/{result2['base_total']}")
print(f"Plus passed: {result2['plus_passed']}/{result2['plus_total']}")
print(f"Total passed: {result2['passed_tests']}/{result2['total_tests']}")
if result2['error']:
print(f"Error: {result2['error']}")

if __name__ == "__main__":
    test_evalplus_evaluation()