#!/usr/bin/env python3
"""
EvalPlus ๋ฐฉ์‹์œผ๋กœ ๋‹จ์ผ MBPP ๋ฌธ์ œ ํ…Œ์ŠคํŠธ
"""

import json
import sys
from pathlib import Path

# Add parent directory to path
sys.path.append(str(Path(__file__).parent))

from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator
from absolute_zero_reasoner.testtime.config import TestTimeConfig
from absolute_zero_reasoner.testtime.logger import TestTimeLogger

def load_single_problem(task_id: str = "Mbpp/2"):
    """๋‹จ์ผ MBPP ๋ฌธ์ œ ๋กœ๋“œ"""
    dataset_path = Path("/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/data/MbppPlus.jsonl")
    
    with open(dataset_path, 'r') as f:
        for line in f:
            problem = json.loads(line.strip())
            if problem['task_id'] == task_id:
                return problem
    
    raise ValueError(f"Problem {task_id} not found")

def test_evalplus_evaluation():
    """EvalPlus ์Šคํƒ€์ผ ํ‰๊ฐ€ ํ…Œ์ŠคํŠธ"""
    # Set up config and logger
    config = TestTimeConfig()
    logger = TestTimeLogger()
    
    # Simple mock model and tokenizer (no real model inference is performed)
    class MockModel:
        def device(self):
            return 'cpu'
    
    class MockTokenizer:
        eos_token = '</s>'
        pad_token = '</s>'
    
    # Initialize the solution generator
    generator = InitialSolutionGenerator(
        model=MockModel(),
        tokenizer=MockTokenizer(),
        config=config,
        logger=logger,
        use_vllm=False
    )
    
    # Load the problem
    problem = load_single_problem("Mbpp/2")
    print(f"Testing problem: {problem['task_id']}")
    print(f"Entry point: {problem['entry_point']}")
    print(f"Base inputs: {len(problem.get('base_input', []))}")
    print(f"Plus inputs: {len(problem.get('plus_input', []))}")
    
    # Test with the canonical (reference) solution
    solution = problem['canonical_solution']
    print("\nTesting with canonical solution:")
    print(solution[:200] + "..." if len(solution) > 200 else solution)
    
    # Run the evaluation
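    # evaluate_solution scores the solution against both the base and the plus (EvalPlus-extended) test inputs.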
    result = generator.evaluate_solution(problem, solution)
    
    # Print the results
    print("\n=== Evaluation Result ===")
    print(f"Correct: {result['correct']}")
    print(f"Base passed: {result['base_passed']}/{result['base_total']}")
    print(f"Plus passed: {result['plus_passed']}/{result['plus_total']}")
    print(f"Total passed: {result['passed_tests']}/{result['total_tests']}")
    if result['error']:
        print(f"Error: {result['error']}")
    
    # ์ž˜๋ชป๋œ ์†”๋ฃจ์…˜์œผ๋กœ๋„ ํ…Œ์ŠคํŠธ
    wrong_solution = """
def similar_elements(test_tup1, test_tup2):
    # Intentionally wrong: returns the union instead of the intersection
    return tuple(set(test_tup1) | set(test_tup2))
"""
    
    print("\n\nTesting with wrong solution:")
    print(wrong_solution)
    
    result2 = generator.evaluate_solution(problem, wrong_solution)
    
    print("\n=== Evaluation Result (Wrong Solution) ===")
    print(f"Correct: {result2['correct']}")
    print(f"Base passed: {result2['base_passed']}/{result2['base_total']}")
    print(f"Plus passed: {result2['plus_passed']}/{result2['plus_total']}")
    print(f"Total passed: {result2['passed_tests']}/{result2['total_tests']}")
    if result2['error']:
        print(f"Error: {result2['error']}")

if __name__ == "__main__":
    test_evalplus_evaluation()