"""
Test a single MBPP problem using EvalPlus-style evaluation.
"""

import json
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parent))
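# The path tweak above assumes the absolute_zero_reasoner package sits next to
# this file (or is otherwise importable), so that the imports below resolve.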

from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator
from absolute_zero_reasoner.testtime.config import TestTimeConfig
from absolute_zero_reasoner.testtime.logger import TestTimeLogger
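

# Fields this script reads from each MbppPlus.jsonl record (names inferred from
# their use below): 'task_id', 'entry_point', 'canonical_solution', and the
# EvalPlus test inputs 'base_input' / 'plus_input'.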
def load_single_problem(task_id: str = "Mbpp/2"):
    """Load a single MBPP problem from the MBPP+ dataset."""
    dataset_path = Path("/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/data/MbppPlus.jsonl")

    with open(dataset_path, 'r') as f:
        for line in f:
            problem = json.loads(line.strip())
            if problem['task_id'] == task_id:
                return problem

    raise ValueError(f"Problem {task_id} not found")


def test_evalplus_evaluation():
    """Test EvalPlus-style evaluation on a single problem."""
    config = TestTimeConfig()
    logger = TestTimeLogger()

    # Lightweight stand-ins: evaluating a fixed solution string should not
    # require actual model generation, so real model/tokenizer objects are
    # not needed here.
    class MockModel:
        def device(self):
            return 'cpu'

    class MockTokenizer:
        eos_token = '</s>'
        pad_token = '</s>'

    generator = InitialSolutionGenerator(
        model=MockModel(),
        tokenizer=MockTokenizer(),
        config=config,
        logger=logger,
        use_vllm=False
    )

    problem = load_single_problem("Mbpp/2")
    print(f"Testing problem: {problem['task_id']}")
    print(f"Entry point: {problem['entry_point']}")
    print(f"Base inputs: {len(problem.get('base_input', []))}")
    print(f"Plus inputs: {len(problem.get('plus_input', []))}")

    # First, evaluate the canonical (reference) solution, which should pass.
    solution = problem['canonical_solution']
    print("\nTesting with canonical solution:")
    print(solution[:200] + "..." if len(solution) > 200 else solution)
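
    # EvalPlus distinguishes the original MBPP ('base') tests from the extended
    # ('plus') tests; evaluate_solution is expected to report pass counts for
    # both, plus an overall 'correct' flag and any 'error' (keys inferred from
    # the prints below).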
    result = generator.evaluate_solution(problem, solution)

    print("\n=== Evaluation Result ===")
    print(f"Correct: {result['correct']}")
    print(f"Base passed: {result['base_passed']}/{result['base_total']}")
    print(f"Plus passed: {result['plus_passed']}/{result['plus_total']}")
    print(f"Total passed: {result['passed_tests']}/{result['total_tests']}")
    if result['error']:
        print(f"Error: {result['error']}")

    wrong_solution = """
def similar_elements(test_tup1, test_tup2):
    # Intentionally wrong solution: returns the union instead of the intersection
    return tuple(set(test_tup1) | set(test_tup2))
"""

    print("\n\nTesting with wrong solution:")
    print(wrong_solution)

    result2 = generator.evaluate_solution(problem, wrong_solution)

    print("\n=== Evaluation Result (Wrong Solution) ===")
    print(f"Correct: {result2['correct']}")
    print(f"Base passed: {result2['base_passed']}/{result2['base_total']}")
    print(f"Plus passed: {result2['plus_passed']}/{result2['plus_total']}")
    print(f"Total passed: {result2['passed_tests']}/{result2['total_tests']}")
    if result2['error']:
        print(f"Error: {result2['error']}")
if __name__ == "__main__": |
|
test_evalplus_evaluation() |