"""
TestTime RLVR ํ”„๋กฌํ”„ํŠธ ์ค‘์•™ ๊ด€๋ฆฌ ์‹œ์Šคํ…œ
๋ชจ๋“  ํ”„๋กฌํ”„ํŠธ๋ฅผ ํ•œ ๊ณณ์—์„œ ๊ด€๋ฆฌํ•˜์—ฌ ์ผ๊ด€์„ฑ๊ณผ ์œ ์ง€๋ณด์ˆ˜์„ฑ์„ ํ–ฅ์ƒ์‹œํ‚ต๋‹ˆ๋‹ค.
"""
from typing import Dict, List, Optional
from dataclasses import dataclass
from enum import Enum
class PromptType(Enum):
"""ํ”„๋กฌํ”„ํŠธ ์œ ํ˜• ์ •์˜"""
SOLUTION_GENERATION = "solution_generation"
DIVERSE_GENERATION = "diverse_generation"
INPUT_GENERATION = "input_generation"
TASK_GENERATION = "task_generation"
TASK_EVALUATION = "task_evaluation"
class BenchmarkType(Enum):
"""๋ฒค์น˜๋งˆํฌ ์œ ํ˜• ์ •์˜"""
HUMANEVAL = "humaneval"
MBPP = "mbpp"
GENERAL = "general"
@dataclass
class PromptTemplate:
"""ํ”„๋กฌํ”„ํŠธ ํ…œํ”Œ๋ฆฟ ๋ฐ์ดํ„ฐ ํด๋ž˜์Šค"""
name: str
template: str
description: str
benchmark: BenchmarkType
temperature: float = 0.05
variables: Optional[List[str]] = None
def __post_init__(self):
if self.variables is None:
self.variables = []
class PromptManager:
"""ํ”„๋กฌํ”„ํŠธ ์ค‘์•™ ๊ด€๋ฆฌ ํด๋ž˜์Šค"""
def __init__(self):
self.prompts = self._initialize_prompts()
def _initialize_prompts(self) -> Dict[str, PromptTemplate]:
"""๋ชจ๋“  ํ”„๋กฌํ”„ํŠธ ํ…œํ”Œ๋ฆฟ ์ดˆ๊ธฐํ™”"""
prompts = {}
# ================================================================================
# 1. SOLUTION GENERATION PROMPTS (Current Evaluation - Baseline)
# ================================================================================
# HumanEval basic solution generation
prompts["solution_humaneval_basic"] = PromptTemplate(
name="HumanEval ๊ธฐ๋ณธ ์†”๋ฃจ์…˜ ์ƒ์„ฑ",
benchmark=BenchmarkType.HUMANEVAL,
temperature=0.05,
description="HumanEval ๋ฌธ์ œ์— ๋Œ€ํ•œ ๊ธฐ๋ณธ ์†”๋ฃจ์…˜ ์ƒ์„ฑ (greedy)",
variables=["problem_prompt"],
template="""You are a Python writing assistant. Complete the following Python function.
{problem_prompt}
Please provide a complete implementation of the function."""
)
# HumanEval multi-function handling
prompts["solution_humaneval_multi"] = PromptTemplate(
name="HumanEval ๋‹ค์ค‘ ํ•จ์ˆ˜ ์†”๋ฃจ์…˜ ์ƒ์„ฑ",
benchmark=BenchmarkType.HUMANEVAL,
temperature=0.05,
description="์—ฌ๋Ÿฌ ํ•จ์ˆ˜๊ฐ€ ์žˆ๋Š” HumanEval ๋ฌธ์ œ ์ฒ˜๋ฆฌ",
variables=["problem_prompt", "entry_point"],
template="""You are a Python writing assistant. Complete the following Python function.
{problem_prompt}
Please provide ONLY the implementation for the function `{entry_point}`.
Complete the body of the `{entry_point}` function where it is incomplete.
Do not modify or reimplement other functions that are already complete."""
)
# MBPP basic solution generation
prompts["solution_mbpp_basic"] = PromptTemplate(
name="MBPP ๊ธฐ๋ณธ ์†”๋ฃจ์…˜ ์ƒ์„ฑ",
benchmark=BenchmarkType.MBPP,
temperature=0.05,
description="MBPP ๋ฌธ์ œ์— ๋Œ€ํ•œ ๊ธฐ๋ณธ ์†”๋ฃจ์…˜ ์ƒ์„ฑ",
variables=["problem_prompt"],
template="""
Please generate a complete, self-contained Python script that solves the following problem.
CRITICAL REQUIREMENTS:
- You MUST maintain the EXACT function signature as shown in the examples
- The function name, parameter names, parameter types, and parameter count MUST match exactly with the examples
- Look at the assert statements carefully to understand the expected function signature
- DO NOT change the number of parameters or their types from what is shown in the examples
Instructions:
- Wrap the entire script in a Markdown code block with syntax highlighting (```python ... ```).
- For each function, include a concise docstring enclosed in triple single quotes (''' ... '''), placed immediately below the def line.
The docstring should briefly describe:
• The function's purpose
• Input parameters
• Return value
Problem statement:
{problem_prompt}
"""
)
# ================================================================================
# 2. DIVERSE GENERATION PROMPTS (Diverse Program Generation)
# ================================================================================
# HumanEval diverse solutions
prompts["diverse_humaneval_basic"] = PromptTemplate(
name="HumanEval ๋‹ค์–‘์„ฑ ์†”๋ฃจ์…˜ ์ƒ์„ฑ",
benchmark=BenchmarkType.HUMANEVAL,
temperature=0.7,
description="HumanEval ๋ฌธ์ œ์— ๋Œ€ํ•œ ๋‹ค์–‘ํ•œ ์ ‘๊ทผ๋ฒ• ์†”๋ฃจ์…˜",
variables=["diversity_instruction", "problem_prompt"],
template="""You are a Python writing assistant. {diversity_instruction}
{problem_prompt}
Please provide a complete implementation of the function."""
)
# HumanEval diverse multi-function solutions
prompts["diverse_humaneval_multi"] = PromptTemplate(
name="HumanEval ๋‹ค์–‘์„ฑ ๋‹ค์ค‘ ํ•จ์ˆ˜ ์†”๋ฃจ์…˜",
benchmark=BenchmarkType.HUMANEVAL,
temperature=0.7,
description="๋‹ค์ค‘ ํ•จ์ˆ˜ HumanEval์— ๋Œ€ํ•œ ๋‹ค์–‘์„ฑ ์†”๋ฃจ์…˜",
variables=["diversity_instruction", "problem_prompt", "entry_point"],
template="""You are a Python writing assistant. {diversity_instruction}
{problem_prompt}
Please provide ONLY the implementation for the function `{entry_point}`.
Complete the body of the `{entry_point}` function where it is incomplete.
Do not modify or reimplement other functions that are already complete."""
)
# MBPP diverse solutions
prompts["diverse_mbpp_basic"] = PromptTemplate(
name="MBPP ๋‹ค์–‘์„ฑ ์†”๋ฃจ์…˜ ์ƒ์„ฑ",
benchmark=BenchmarkType.MBPP,
temperature=0.7,
description="MBPP ๋ฌธ์ œ์— ๋Œ€ํ•œ ๋‹ค์–‘ํ•œ ์ ‘๊ทผ๋ฒ• ์†”๋ฃจ์…˜",
variables=["diversity_instruction", "problem_prompt"],
template="""Please generate a complete, self-contained Python script that solves the following problem.
CRITICAL REQUIREMENTS:
- You MUST maintain the EXACT function signature as shown in the examples
- The function name, parameter names, parameter types, and parameter count MUST match exactly with the examples
- Look at the assert statements carefully to understand the expected function signature
- DO NOT change the number of parameters or their types from what is shown in the examples
Instructions:
- Wrap the entire script in a Markdown code block with syntax highlighting (```python ... ```).
- For each function, include a concise docstring enclosed in triple single quotes (''' ... '''), placed immediately below the def line.
The docstring should briefly describe:
• The function's purpose
• Input parameters
• Return value
{diversity_instruction}
Problem statement:
{problem_prompt}
"""
)
# ================================================================================
# 3. INPUT GENERATION PROMPTS (Input Augmentation)
# ================================================================================
prompts["input_generation_basic"] = PromptTemplate(
name="๊ธฐ๋ณธ ์ž…๋ ฅ ์ƒ์„ฑ",
benchmark=BenchmarkType.GENERAL,
temperature=0.5,
description="๊ธฐ์กด IPO ์˜ˆ์ œ๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ์ƒˆ๋กœ์šด ์ž…๋ ฅ ์ƒ์„ฑ",
variables=["problem_description", "existing_examples", "full_code", "arg_type_info"],
template="""Given the following problem description and its Python function implementation, first analyze the types and valid ranges of the function arguments, then write **5 different example inputs** for the function that cover a diverse mix of typical (general) cases and edge/boundary cases.
Problem Description:
'''
{problem_description}
'''
Existing Examples from Problem:
{existing_examples}
Function Implementation:
```python
{full_code}
```
{arg_type_info}
Based on the existing examples above, generate 5 NEW diverse test inputs that are different from the existing ones. Each input should be a Python dict where:
- Keys are the exact parameter names from the function signature
- Values are appropriate test values for each parameter
Format your response as:
```python
examples = [
{{dict_with_all_function_parameters}}, # Description of this test case
{{dict_with_all_function_parameters}}, # Description of this test case
... # Continue for all 5 examples
]
```
Ensure your examples include:
- At least 2 typical/general cases
- At least 2 edge/boundary cases
- 1 special case (empty, zero, maximum values, etc.)
- All examples should be DIFFERENT from the existing examples shown above"""
)
# ================================================================================
# 4. TASK GENERATION PROMPTS (IPO → Reasoning Tasks)
# ================================================================================
prompts["task_induction"] = PromptTemplate(
name="Induction ํƒœ์Šคํฌ ์ƒ์„ฑ (AZR code_f)",
benchmark=BenchmarkType.GENERAL,
temperature=0.05,
description="์ฃผ์–ด์ง„ ์ž…๋ ฅ-์ถœ๋ ฅ์œผ๋กœ๋ถ€ํ„ฐ ํ”„๋กœ๊ทธ๋žจ ์ถ”๋ก  (AZR ์›๋ณธ)",
variables=["input_output_pairs", "message"],
template="""A conversation between User and Assistant.
The User provides a set of input/output pairs and a message describing the hidden function. The Assistant must:
1. **Privately think step-by-step** about how to reconstruct the general function based on the provided examples.
2. **Output exactly one** `<think>...</think>` block containing the full reasoning process.
3. **Then output exactly one** `<answer>...</answer>` block containing **only** the Python code snippet defining the function `f`—no labels, no comments, no extra text.
4. **Do not** generate any text outside these two blocks.
5. Adhere to the **code requirements** and **formatting rules**.
# Code Requirements:
- Name the entry function `f` (e.g., `def f(...): ...`); you may include nested definitions inside `f`.
- Ensure the function returns a value.
- Include at least one input parameter.
- Make the function deterministic.
- AVOID the FOLLOWING:
* Random functions or variables
* Date/time operations
* I/O operations (reading files, network requests)
* Printing or logging
* Any external state
- Ensure execution completes within 10 seconds on a modern CPU.
- All imports and custom class definitions must be at the very top of the code snippet.
- The snippet must end with a return statement from the main function `f`; anything after will be removed.
User:
# Input and Output Pairs:
{input_output_pairs}
# Message:
{message}"""
)
prompts["task_deduction"] = PromptTemplate(
name="Deduction ํƒœ์Šคํฌ ์ƒ์„ฑ (AZR code_o)",
benchmark=BenchmarkType.GENERAL,
temperature=0.05,
description="์ฃผ์–ด์ง„ ํ”„๋กœ๊ทธ๋žจ๊ณผ ์ž…๋ ฅ์œผ๋กœ๋ถ€ํ„ฐ ์ถœ๋ ฅ ์ถ”๋ก  (AZR ์›๋ณธ)",
variables=["snippet", "input_args"],
template="""A conversation between User and Assistant.
The User provides a Python code snippet and specific input values. The Assistant must:
1. **Privately think step-by-step** about how the code executes with the given inputs.
2. **Output exactly one** `<think>...</think>` block containing your full reasoning.
3. **Then output exactly one** `<answer>...</answer>` block containing **only** the output values—no labels, no comments, no extra text.
4. **Do not** generate any text outside these two blocks.
5. Adhere to the **output rules**.
# Output Rules:
- If the output is a string, wrap it in quotes.
- For dicts, lists, and other literals, use valid Python literal notation.
User:
# Python Code Snippet:
{snippet}
# Input:
{input_args}"""
)
prompts["task_abduction"] = PromptTemplate(
name="Abduction ํƒœ์Šคํฌ ์ƒ์„ฑ (AZR code_i)",
benchmark=BenchmarkType.GENERAL,
temperature=0.05,
description="์ฃผ์–ด์ง„ ํ”„๋กœ๊ทธ๋žจ๊ณผ ์ถœ๋ ฅ์œผ๋กœ๋ถ€ํ„ฐ ์ž…๋ ฅ ์ถ”๋ก  (AZR ์›๋ณธ)",
variables=["snippet", "output"],
template="""A conversation between User and Assistant.
The User provides a Python code snippet and its observed output. The Assistant must:
1. **Privately think step-by-step** about which input produces that output.
2. **Output exactly one** `<think>...</think>` block containing your full reasoning.
3. **Then output exactly one** `<answer>...</answer>` block containing **only** the input values—no labels, no comments, no extra text.
4. **Do not** generate any text outside these two blocks.
5. Adhere to the **input rules**.
# Input Rules:
- If an argument is a string, wrap it in quotes.
- For multiple arguments, separate by commas.
- Use Python literal notation for lists, dicts, tuples.
- Boolean values must be `True` or `False`.
User:
# Python Code Snippet:
{snippet}
# Observed Output:
{output}"""
)
# ================================================================================
# 5. TASK EVALUATION PROMPTS (LLM Task Responses)
# ================================================================================
prompts["task_evaluation_basic"] = PromptTemplate(
name="๊ธฐ๋ณธ ํƒœ์Šคํฌ ํ‰๊ฐ€",
benchmark=BenchmarkType.GENERAL,
temperature=0.05,
description="์ƒ์„ฑ๋œ ์ถ”๋ก  ํƒœ์Šคํฌ์— ๋Œ€ํ•œ LLM ์‘๋‹ต",
variables=["task_prompt"],
template="{task_prompt}"
)
return prompts
def get_prompt(self, prompt_key: str, **kwargs) -> str:
"""ํ”„๋กฌํ”„ํŠธ ํ‚ค๋กœ ํ…œํ”Œ๋ฆฟ์„ ๊ฐ€์ ธ์™€ ๋ณ€์ˆ˜๋ฅผ ์ฑ„์›€"""
if prompt_key not in self.prompts:
raise ValueError(f"Unknown prompt key: {prompt_key}")
template = self.prompts[prompt_key]
# Check that all required variables are provided
missing_vars = []
for var in template.variables:
if var not in kwargs:
missing_vars.append(var)
if missing_vars:
raise ValueError(f"Missing required variables for prompt '{prompt_key}': {missing_vars}")
# Format the template
try:
return template.template.format(**kwargs)
except KeyError as e:
raise ValueError(f"Template formatting error for prompt '{prompt_key}': {e}")
def get_temperature(self, prompt_key: str) -> float:
"""ํ”„๋กฌํ”„ํŠธ์˜ ๊ถŒ์žฅ temperature ๋ฐ˜ํ™˜"""
if prompt_key not in self.prompts:
raise ValueError(f"Unknown prompt key: {prompt_key}")
return self.prompts[prompt_key].temperature
def get_diversity_instruction(self, variation_id: int) -> str:
"""variation_id์— ๋”ฐ๋ฅธ ๋‹ค์–‘์„ฑ ์ง€์‹œ๋ฌธ ๋ฐ˜ํ™˜"""
diversity_instructions = [
"", # ๊ธฐ๋ณธ
"",
"",
""
]
# diversity_instructions = [
# "", # ๊ธฐ๋ณธ
# "Implement this in a robust way that works well for various examples",
# "Provide an alternative solution with a unique implementation style:",
# "Try to implement using a different approach, algorithm, or coding style than typical solutions."
# ]
return diversity_instructions[variation_id % len(diversity_instructions)]
def list_prompts(self) -> Dict[str, PromptTemplate]:
"""๋ชจ๋“  ํ”„๋กฌํ”„ํŠธ ํ…œํ”Œ๋ฆฟ ๋ชฉ๋ก ๋ฐ˜ํ™˜"""
return self.prompts.copy()
def get_prompts_by_type(self, benchmark: BenchmarkType) -> Dict[str, PromptTemplate]:
"""๋ฒค์น˜๋งˆํฌ ํƒ€์ž…๋ณ„ ํ”„๋กฌํ”„ํŠธ ๋ฐ˜ํ™˜"""
return {
key: template for key, template in self.prompts.items()
if template.benchmark == benchmark or template.benchmark == BenchmarkType.GENERAL
}
# Global prompt manager instance
prompt_manager = PromptManager()
# ํŽธ์˜ ํ•จ์ˆ˜๋“ค
def get_prompt(prompt_key: str, **kwargs) -> str:
"""ํ”„๋กฌํ”„ํŠธ ๊ฐ€์ ธ์˜ค๊ธฐ ํŽธ์˜ ํ•จ์ˆ˜"""
return prompt_manager.get_prompt(prompt_key, **kwargs)
def get_temperature(prompt_key: str) -> float:
"""ํ”„๋กฌํ”„ํŠธ temperature ๊ฐ€์ ธ์˜ค๊ธฐ ํŽธ์˜ ํ•จ์ˆ˜"""
return prompt_manager.get_temperature(prompt_key)
def get_diversity_instruction(variation_id: int) -> str:
"""๋‹ค์–‘์„ฑ ์ง€์‹œ๋ฌธ ๊ฐ€์ ธ์˜ค๊ธฐ ํŽธ์˜ ํ•จ์ˆ˜"""
return prompt_manager.get_diversity_instruction(variation_id)
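# --------------------------------------------------------------------------------
# Minimal usage sketch: exercises only the public helpers defined above. The
# problem stub passed to get_prompt below is a hypothetical toy example, not part
# of any benchmark.
# --------------------------------------------------------------------------------
if __name__ == "__main__":
    # Render the baseline HumanEval solution prompt for a toy problem.
    example_problem = 'def add(a: int, b: int) -> int:\n    """Return the sum of a and b."""\n'
    rendered = get_prompt("solution_humaneval_basic", problem_prompt=example_problem)
    print(rendered)

    # Recommended sampling temperature for this prompt (0.05, i.e. near-greedy).
    print("temperature:", get_temperature("solution_humaneval_basic"))

    # Diversity instruction for variation 1 (currently an empty string).
    print("diversity instruction:", repr(get_diversity_instruction(1)))

    # Keys of all prompts applicable to MBPP (benchmark-specific plus GENERAL).
    for key in prompt_manager.get_prompts_by_type(BenchmarkType.MBPP):
        print("mbpp-compatible prompt:", key)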