|
|
""" |
|
|
TestTime RLVR ํ๋กฌํํธ ์ค์ ๊ด๋ฆฌ ์์คํ
|
|
|
|
|
|
๋ชจ๋ ํ๋กฌํํธ๋ฅผ ํ ๊ณณ์์ ๊ด๋ฆฌํ์ฌ ์ผ๊ด์ฑ๊ณผ ์ ์ง๋ณด์์ฑ์ ํฅ์์ํต๋๋ค. |
|
|
""" |
|
|
|
|
|
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List
|
|
|
|
|
|
|
|
class PromptType(Enum):
    """Kinds of prompt handled by this module.

    Each member's value is the lowercase string form of its name.
    """

    SOLUTION_GENERATION = "solution_generation"
    DIVERSE_GENERATION = "diverse_generation"
    INPUT_GENERATION = "input_generation"
    TASK_GENERATION = "task_generation"
    TASK_EVALUATION = "task_evaluation"
|
|
|
|
|
|
|
|
class BenchmarkType(Enum):
    """Benchmarks a prompt template can be tied to.

    ``GENERAL`` marks templates that are not specific to one benchmark.
    """

    HUMANEVAL = "humaneval"
    MBPP = "mbpp"
    GENERAL = "general"
|
|
|
|
|
|
|
|
@dataclass
class PromptTemplate:
    """Data record describing a single prompt template.

    Attributes:
        name: Human-readable name of the template.
        template: Prompt text with ``str.format``-style ``{placeholder}``
            fields.
        description: Short explanation of what the prompt is for.
        benchmark: Benchmark the template is tied to.
        temperature: Sampling temperature stored with the template
            (defaults to 0.05).
        variables: Names of the placeholders that must be supplied when the
            template is rendered; defaults to an empty list.
    """

    name: str
    template: str
    description: str
    # Forward reference so this record does not depend on definition order.
    benchmark: "BenchmarkType"
    temperature: float = 0.05
    # default_factory gives every instance its own list.
    variables: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Callers may still pass ``variables=None`` explicitly (the previous
        # default); normalize it so ``variables`` is always a list.
        if self.variables is None:
            self.variables = []
|
|
|
|
|
|
|
|
class PromptManager: |
|
|
"""ํ๋กฌํํธ ์ค์ ๊ด๋ฆฌ ํด๋์ค""" |
|
|
|
|
|
def __init__(self): |
|
|
self.prompts = self._initialize_prompts() |
|
|
|
|
|
def _initialize_prompts(self) -> Dict[str, PromptTemplate]: |
|
|
"""๋ชจ๋ ํ๋กฌํํธ ํ
ํ๋ฆฟ ์ด๊ธฐํ""" |
|
|
|
|
|
prompts = {} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
prompts["solution_humaneval_basic"] = PromptTemplate( |
|
|
name="HumanEval ๊ธฐ๋ณธ ์๋ฃจ์
์์ฑ", |
|
|
benchmark=BenchmarkType.HUMANEVAL, |
|
|
temperature=0.05, |
|
|
description="HumanEval ๋ฌธ์ ์ ๋ํ ๊ธฐ๋ณธ ์๋ฃจ์
์์ฑ (greedy)", |
|
|
variables=["problem_prompt"], |
|
|
template="""You are a Python writing assistant. Complete the following Python function. |
|
|
|
|
|
{problem_prompt} |
|
|
|
|
|
Please provide a complete implementation of the function.""" |
|
|
) |
|
|
|
|
|
|
|
|
prompts["solution_humaneval_multi"] = PromptTemplate( |
|
|
name="HumanEval ๋ค์ค ํจ์ ์๋ฃจ์
์์ฑ", |
|
|
benchmark=BenchmarkType.HUMANEVAL, |
|
|
temperature=0.05, |
|
|
description="์ฌ๋ฌ ํจ์๊ฐ ์๋ HumanEval ๋ฌธ์ ์ฒ๋ฆฌ", |
|
|
variables=["problem_prompt", "entry_point"], |
|
|
template="""You are a Python writing assistant. Complete the following Python function. |
|
|
|
|
|
{problem_prompt} |
|
|
|
|
|
Please provide ONLY the implementation for the function `{entry_point}`. |
|
|
Complete the body of the `{entry_point}` function where it is incomplete. |
|
|
Do not modify or reimplement other functions that are already complete.""" |
|
|
) |
|
|
|
|
|
|
|
|
prompts["solution_mbpp_basic"] = PromptTemplate( |
|
|
name="MBPP ๊ธฐ๋ณธ ์๋ฃจ์
์์ฑ", |
|
|
benchmark=BenchmarkType.MBPP, |
|
|
temperature=0.05, |
|
|
description="MBPP ๋ฌธ์ ์ ๋ํ ๊ธฐ๋ณธ ์๋ฃจ์
์์ฑ", |
|
|
variables=["problem_prompt"], |
|
|
template=""" |
|
|
Please generate a complete, self-contained Python script that solves the following problem. |
|
|
|
|
|
CRITICAL REQUIREMENTS: |
|
|
- You MUST maintain the EXACT function signature as shown in the examples |
|
|
- The function name, parameter names, parameter types, and parameter count MUST match exactly with the examples |
|
|
- Look at the assert statements carefully to understand the expected function signature |
|
|
- DO NOT change the number of parameters or their types from what is shown in the examples |
|
|
|
|
|
Instructions: |
|
|
- Wrap the entire script in a Markdown code block with syntax highlighting (```python ... ```). |
|
|
- For each function, include a concise docstring enclosed in triple single quotes (''' ... '''), placed immediately below the def line. |
|
|
The docstring should briefly describe: |
|
|
โข The function's purpose |
|
|
โข Input parameters |
|
|
โข Return value |
|
|
|
|
|
Problem statement: |
|
|
{problem_prompt} |
|
|
""" |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
prompts["diverse_humaneval_basic"] = PromptTemplate( |
|
|
name="HumanEval ๋ค์์ฑ ์๋ฃจ์
์์ฑ", |
|
|
benchmark=BenchmarkType.HUMANEVAL, |
|
|
temperature=0.7, |
|
|
description="HumanEval ๋ฌธ์ ์ ๋ํ ๋ค์ํ ์ ๊ทผ๋ฒ ์๋ฃจ์
", |
|
|
variables=["diversity_instruction", "problem_prompt"], |
|
|
template="""You are a Python writing assistant. {diversity_instruction} |
|
|
|
|
|
{problem_prompt} |
|
|
|
|
|
Please provide a complete implementation of the function.""" |
|
|
) |
|
|
|
|
|
|
|
|
prompts["diverse_humaneval_multi"] = PromptTemplate( |
|
|
name="HumanEval ๋ค์์ฑ ๋ค์ค ํจ์ ์๋ฃจ์
", |
|
|
benchmark=BenchmarkType.HUMANEVAL, |
|
|
temperature=0.7, |
|
|
description="๋ค์ค ํจ์ HumanEval์ ๋ํ ๋ค์์ฑ ์๋ฃจ์
", |
|
|
variables=["diversity_instruction", "problem_prompt", "entry_point"], |
|
|
template="""You are a Python writing assistant. {diversity_instruction} |
|
|
|
|
|
{problem_prompt} |
|
|
|
|
|
Please provide ONLY the implementation for the function `{entry_point}`. |
|
|
Complete the body of the `{entry_point}` function where it is incomplete. |
|
|
Do not modify or reimplement other functions that are already complete.""" |
|
|
) |
|
|
|
|
|
|
|
|
prompts["diverse_mbpp_basic"] = PromptTemplate( |
|
|
name="MBPP ๋ค์์ฑ ์๋ฃจ์
์์ฑ", |
|
|
benchmark=BenchmarkType.MBPP, |
|
|
temperature=0.7, |
|
|
description="MBPP ๋ฌธ์ ์ ๋ํ ๋ค์ํ ์ ๊ทผ๋ฒ ์๋ฃจ์
", |
|
|
variables=["diversity_instruction", "problem_prompt"], |
|
|
template="""Please generate a complete, self-contained Python script that solves the following problem. |
|
|
|
|
|
CRITICAL REQUIREMENTS: |
|
|
- You MUST maintain the EXACT function signature as shown in the examples |
|
|
- The function name, parameter names, parameter types, and parameter count MUST match exactly with the examples |
|
|
- Look at the assert statements carefully to understand the expected function signature |
|
|
- DO NOT change the number of parameters or their types from what is shown in the examples |
|
|
|
|
|
Instructions: |
|
|
- Wrap the entire script in a Markdown code block with syntax highlighting (```python ... ```). |
|
|
- For each function, include a concise docstring enclosed in triple single quotes (''' ... '''), placed immediately below the def line. |
|
|
The docstring should briefly describe: |
|
|
โข The function's purpose |
|
|
โข Input parameters |
|
|
โข Return value |
|
|
|
|
|
{diversity_instruction} |
|
|
|
|
|
Problem statement: |
|
|
{problem_prompt} |
|
|
""" |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
prompts["input_generation_basic"] = PromptTemplate( |
|
|
name="๊ธฐ๋ณธ ์
๋ ฅ ์์ฑ", |
|
|
benchmark=BenchmarkType.GENERAL, |
|
|
temperature=0.5, |
|
|
description="๊ธฐ์กด IPO ์์ ๋ฅผ ๋ฐํ์ผ๋ก ์๋ก์ด ์
๋ ฅ ์์ฑ", |
|
|
variables=["problem_description", "existing_examples", "full_code", "arg_type_info"], |
|
|
template="""Given the following problem description and its Python function implementation, first analyze the types and valid ranges of the function arguments, then write **5 different example inputs** for the function that cover a diverse mix of typical (general) cases and edge/boundary cases. |
|
|
|
|
|
Problem Description: |
|
|
''' |
|
|
{problem_description} |
|
|
''' |
|
|
|
|
|
Existing Examples from Problem: |
|
|
{existing_examples} |
|
|
|
|
|
Function Implementation: |
|
|
```python |
|
|
{full_code} |
|
|
``` |
|
|
|
|
|
{arg_type_info} |
|
|
|
|
|
Based on the existing examples above, generate 5 NEW diverse test inputs that are different from the existing ones. Each input should be a Python dict where: |
|
|
- Keys are the exact parameter names from the function signature |
|
|
- Values are appropriate test values for each parameter |
|
|
|
|
|
Format your response as: |
|
|
```python |
|
|
examples = [ |
|
|
{{dict_with_all_function_parameters}}, # Description of this test case |
|
|
{{dict_with_all_function_parameters}}, # Description of this test case |
|
|
... # Continue for all 5 examples |
|
|
] |
|
|
``` |
|
|
|
|
|
Ensure your examples include: |
|
|
- At least 2 typical/general cases |
|
|
- At least 2 edge/boundary cases |
|
|
- 1 special case (empty, zero, maximum values, etc.) |
|
|
- All examples should be DIFFERENT from the existing examples shown above""" |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
prompts["task_induction"] = PromptTemplate( |
|
|
name="Induction ํ์คํฌ ์์ฑ (AZR code_f)", |
|
|
benchmark=BenchmarkType.GENERAL, |
|
|
temperature=0.05, |
|
|
description="์ฃผ์ด์ง ์
๋ ฅ-์ถ๋ ฅ์ผ๋ก๋ถํฐ ํ๋ก๊ทธ๋จ ์ถ๋ก (AZR ์๋ณธ)", |
|
|
variables=["input_output_pairs", "message"], |
|
|
template="""A conversation between User and Assistant. |
|
|
The User provides a set of input/output pairs and a message describing the hidden function. The Assistant must: |
|
|
1. **Privately think step-by-step** about how to reconstruct the general function based on the provided examples. |
|
|
2. **Output exactly one** `<think>...</think>` block containing the full reasoning process. |
|
|
3. **Then output exactly one** `<answer>...</answer>` block containing **only** the Python code snippet defining the function `f`โno labels, no comments, no extra text. |
|
|
4. **Do not** generate any text outside these two blocks. |
|
|
5. Follow to the **code requirements** and **formatting rules**. |
|
|
|
|
|
# Code Requirements: |
|
|
- Name the entry function `f` (e.g., `def f(...): ...`), you may include nested definitions inside `f`. |
|
|
- Ensure the function returns a value. |
|
|
- Include at least one input parameter. |
|
|
- Make the function deterministic. |
|
|
- AVOID the FOLLOWING: |
|
|
* Random functions or variables |
|
|
* Date/time operations |
|
|
* I/O operations (reading files, network requests) |
|
|
* Printing or logging |
|
|
* Any external state |
|
|
- Ensure execution completes within 10 seconds on a modern CPU. |
|
|
- All imports and custom class definitions must be at the very top of the code snippet. |
|
|
- The snippet must end with a return statement from the main function `f`; anything after will be removed. |
|
|
|
|
|
User: |
|
|
# Input and Output Pairs: |
|
|
{input_output_pairs} |
|
|
|
|
|
# Message: |
|
|
{message}""" |
|
|
) |
|
|
|
|
|
prompts["task_deduction"] = PromptTemplate( |
|
|
name="Deduction ํ์คํฌ ์์ฑ (AZR code_o)", |
|
|
benchmark=BenchmarkType.GENERAL, |
|
|
temperature=0.05, |
|
|
description="์ฃผ์ด์ง ํ๋ก๊ทธ๋จ๊ณผ ์
๋ ฅ์ผ๋ก๋ถํฐ ์ถ๋ ฅ ์ถ๋ก (AZR ์๋ณธ)", |
|
|
variables=["snippet", "input_args"], |
|
|
template="""A conversation between User and Assistant. |
|
|
The User provides a Python code snippet and specific input values. The Assistant must: |
|
|
1. **Privately think step-by-step** about how the code executes with the given inputs. |
|
|
2. **Output exactly one** `<think>...</think>` block containing your full reasoning. |
|
|
3. **Then output exactly one** `<answer>...</answer>` block containing **only** the output valuesโno labels, no comments, no extra text. |
|
|
4. **Do not** generate any text outside these two blocks. |
|
|
5. Adhere to the **output rules**. |
|
|
|
|
|
# Output Rules: |
|
|
- If the output is a string, wrap it in quotes. |
|
|
- For dicts, lists, and other literals, use valid Python literal notation. |
|
|
|
|
|
User: |
|
|
# Python Code Snippet: |
|
|
{snippet} |
|
|
|
|
|
# Input: |
|
|
{input_args}""" |
|
|
) |
|
|
|
|
|
prompts["task_abduction"] = PromptTemplate( |
|
|
name="Abduction ํ์คํฌ ์์ฑ (AZR code_i)", |
|
|
benchmark=BenchmarkType.GENERAL, |
|
|
temperature=0.05, |
|
|
description="์ฃผ์ด์ง ํ๋ก๊ทธ๋จ๊ณผ ์ถ๋ ฅ์ผ๋ก๋ถํฐ ์
๋ ฅ ์ถ๋ก (AZR ์๋ณธ)", |
|
|
variables=["snippet", "output"], |
|
|
template="""A conversation between User and Assistant. |
|
|
The User provides a Python code snippet and its observed output. The Assistant must: |
|
|
1. **Privately think step-by-step** about which input produces that output. |
|
|
2. **Output exactly one** `<think>...</think>` block containing your full reasoning. |
|
|
3. **Then output exactly one** `<answer>...</answer>` block containing **only** the input valuesโno labels, no comments, no extra text. |
|
|
4. **Do not** generate any text outside these two blocks. |
|
|
5. Adhere to the **input rules**. |
|
|
|
|
|
# Input Rules: |
|
|
- If an argument is a string, wrap it in quotes. |
|
|
- For multiple arguments, separate by commas. |
|
|
- Use Python literal notation for lists, dicts, tuples. |
|
|
- Boolean values must be `True` or `False`. |
|
|
|
|
|
User: |
|
|
# Python Code Snippet: |
|
|
{snippet} |
|
|
|
|
|
# Observed Output: |
|
|
{output}""" |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
prompts["task_evaluation_basic"] = PromptTemplate( |
|
|
name="๊ธฐ๋ณธ ํ์คํฌ ํ๊ฐ", |
|
|
benchmark=BenchmarkType.GENERAL, |
|
|
temperature=0.05, |
|
|
description="์์ฑ๋ ์ถ๋ก ํ์คํฌ์ ๋ํ LLM ์๋ต", |
|
|
variables=["task_prompt"], |
|
|
template="{task_prompt}" |
|
|
) |
|
|
|
|
|
return prompts |
|
|
|
|
|
def get_prompt(self, prompt_key: str, **kwargs) -> str: |
|
|
"""ํ๋กฌํํธ ํค๋ก ํ
ํ๋ฆฟ์ ๊ฐ์ ธ์ ๋ณ์๋ฅผ ์ฑ์""" |
|
|
if prompt_key not in self.prompts: |
|
|
raise ValueError(f"Unknown prompt key: {prompt_key}") |
|
|
|
|
|
template = self.prompts[prompt_key] |
|
|
|
|
|
|
|
|
missing_vars = [] |
|
|
for var in template.variables: |
|
|
if var not in kwargs: |
|
|
missing_vars.append(var) |
|
|
|
|
|
if missing_vars: |
|
|
raise ValueError(f"Missing required variables for prompt '{prompt_key}': {missing_vars}") |
|
|
|
|
|
|
|
|
try: |
|
|
return template.template.format(**kwargs) |
|
|
except KeyError as e: |
|
|
raise ValueError(f"Template formatting error for prompt '{prompt_key}': {e}") |
|
|
|
|
|
def get_temperature(self, prompt_key: str) -> float: |
|
|
"""ํ๋กฌํํธ์ ๊ถ์ฅ temperature ๋ฐํ""" |
|
|
if prompt_key not in self.prompts: |
|
|
raise ValueError(f"Unknown prompt key: {prompt_key}") |
|
|
return self.prompts[prompt_key].temperature |
|
|
|
|
|
def get_diversity_instruction(self, variation_id: int) -> str: |
|
|
"""variation_id์ ๋ฐ๋ฅธ ๋ค์์ฑ ์ง์๋ฌธ ๋ฐํ""" |
|
|
diversity_instructions = [ |
|
|
"", |
|
|
"", |
|
|
"", |
|
|
"" |
|
|
] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return diversity_instructions[variation_id % len(diversity_instructions)] |
|
|
|
|
|
def list_prompts(self) -> Dict[str, PromptTemplate]: |
|
|
"""๋ชจ๋ ํ๋กฌํํธ ํ
ํ๋ฆฟ ๋ชฉ๋ก ๋ฐํ""" |
|
|
return self.prompts.copy() |
|
|
|
|
|
def get_prompts_by_type(self, benchmark: BenchmarkType) -> Dict[str, PromptTemplate]: |
|
|
"""๋ฒค์น๋งํฌ ํ์
๋ณ ํ๋กฌํํธ ๋ฐํ""" |
|
|
return { |
|
|
key: template for key, template in self.prompts.items() |
|
|
if template.benchmark == benchmark or template.benchmark == BenchmarkType.GENERAL |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
# Shared module-level manager used by the convenience functions in this
# module; it is created once when the module is imported.
prompt_manager = PromptManager()
|
|
|
|
|
|
|
|
|
|
|
def get_prompt(prompt_key: str, **kwargs) -> str:
    """Render a prompt via the module-level ``prompt_manager``.

    Convenience wrapper that forwards ``prompt_key`` and the placeholder
    values in ``**kwargs`` to ``prompt_manager.get_prompt`` and returns its
    result.
    """
    return prompt_manager.get_prompt(prompt_key, **kwargs)
|
|
|
|
|
|
|
|
def get_temperature(prompt_key: str) -> float:
    """Return a prompt's stored temperature via ``prompt_manager``.

    Convenience wrapper around ``prompt_manager.get_temperature``.
    """
    return prompt_manager.get_temperature(prompt_key)
|
|
|
|
|
|
|
|
def get_diversity_instruction(variation_id: int) -> str:
    """Return the diversity instruction for ``variation_id``.

    Convenience wrapper around ``prompt_manager.get_diversity_instruction``.
    """
    return prompt_manager.get_diversity_instruction(variation_id)