hjkim00 committed on Aug 27

Commit

24c2665

verified ·

1 Parent(s): e7b37ff

Restore all essential files - code, configs, and MBPP/HumanEval data

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +10 -0
LICENSE +21 -0
README.md +581 -0
Update/2025-01-25_humaneval_fixes.md +113 -0
Update/Phase1_Infrastructure_Setup.md +65 -0
Update/Phase2_Benchmark_System.md +85 -0
Update/Phase3_AZR_Template_Integration.md +100 -0
Update/Phase3_IPO_Extraction.md +129 -0
Update/Phase4_Complete_Pipeline_Implementation.md +203 -0
Update/Phase5_Critical_Bug_Fixes_and_EvalPlus_Integration.md +226 -0
Update/unified_ttrlvr_architecture.md +646 -0
absolute_zero_reasoner/__init__.py +0 -0
absolute_zero_reasoner/configs/azr_ppo_trainer.yaml +605 -0
absolute_zero_reasoner/data_construction/__init__.py +0 -0
absolute_zero_reasoner/data_construction/constructor.py +225 -0
absolute_zero_reasoner/data_construction/process_code_reasoning_data.py +175 -0
absolute_zero_reasoner/data_construction/process_data.py +210 -0
absolute_zero_reasoner/data_construction/prompts.py +546 -0
absolute_zero_reasoner/main_azr_ppo.py +260 -0
absolute_zero_reasoner/rewards/__init__.py +0 -0
absolute_zero_reasoner/rewards/code_reward.py +554 -0
absolute_zero_reasoner/rewards/custom_evaluate.py +387 -0
absolute_zero_reasoner/rewards/math_utils.py +490 -0
absolute_zero_reasoner/rewards/reward_managers.py +898 -0
absolute_zero_reasoner/rewards/ttrlvr_reward_manager.py +244 -0
absolute_zero_reasoner/testtime/__init__.py +34 -0
absolute_zero_reasoner/testtime/benchmark_loader.py +223 -0
absolute_zero_reasoner/testtime/complete_pipeline.py +0 -0
absolute_zero_reasoner/testtime/config.py +162 -0
absolute_zero_reasoner/testtime/ipo_extractor.py +1235 -0
absolute_zero_reasoner/testtime/logger.py +295 -0
absolute_zero_reasoner/testtime/prompts.py +413 -0
absolute_zero_reasoner/testtime/solution_generator.py +877 -0
absolute_zero_reasoner/testtime/task_generator.py +473 -0
absolute_zero_reasoner/trainer/__init__.py +0 -0
absolute_zero_reasoner/trainer/ppo/__init__.py +0 -0
absolute_zero_reasoner/trainer/ppo/azr_ray_trainer.py +0 -0
absolute_zero_reasoner/trainer/ppo/reason_rl_ray_trainer.py +768 -0
absolute_zero_reasoner/trainer/ppo/ttrlvr_azr_integration.py +125 -0
absolute_zero_reasoner/utils/__init__.py +0 -0
absolute_zero_reasoner/utils/auxiliary.py +11 -0
absolute_zero_reasoner/utils/code_utils/__init__.py +0 -0
absolute_zero_reasoner/utils/code_utils/checks.py +182 -0
absolute_zero_reasoner/utils/code_utils/parsers.py +202 -0
absolute_zero_reasoner/utils/code_utils/python_executor.py +435 -0
absolute_zero_reasoner/utils/code_utils/sandboxfusion_executor.py +372 -0
absolute_zero_reasoner/utils/code_utils/templates.py +68 -0
absolute_zero_reasoner/utils/convert2hf.py +55 -0
absolute_zero_reasoner/utils/dataset/__init__.py +0 -0
absolute_zero_reasoner/utils/dataset/ipo_grouped_sampler.py +220 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/absolute_zero_paradigm.png filter=lfs diff=lfs merge=lfs -text
+assets/azr.png filter=lfs diff=lfs merge=lfs -text
+evaluation/code_eval/coding/evalplus/gallary/render.gif filter=lfs diff=lfs merge=lfs -text
+evaluation/math_eval/eval/data/tabmwp/test.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/math_eval/eval/latex2sympy/antlr-4.11.1-complete.jar filter=lfs diff=lfs merge=lfs -text
+evaluation/math_eval/latex2sympy/antlr-4.11.1-complete.jar filter=lfs diff=lfs merge=lfs -text
+logs/task_generation/tasks_Mbpp_7.json filter=lfs diff=lfs merge=lfs -text
+test/logs/task_generation/tasks_HumanEval_28.json filter=lfs diff=lfs merge=lfs -text
+test/logs/task_generation/tasks_Mbpp_2.json filter=lfs diff=lfs merge=lfs -text
+test/logs/task_generation/tasks_Mbpp_242.json filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 LeapLab
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,581 @@

+<div align="center">
+# TestTime RLVR: Test-Time Reinforcement Learning with Verification and Reasoning
+*Based on Absolute Zero Reasoner (AZR) Methodology*
+[![Paper](https://img.shields.io/badge/paper-A42C25?style=for-the-badge&logo=arxiv&logoColor=white)](https://arxiv.org/abs/2505.03335)    [![Project Page](https://img.shields.io/badge/Project%20Page-blue?style=for-the-badge&logo=snowflake&logoColor=white&labelColor=black)](https://andrewzh112.github.io/absolute-zero-reasoner/)    [![Github](https://img.shields.io/badge/Code-000000?style=for-the-badge&logo=github&logoColor=000&logoColor=white)](https://github.com/LeapLabTHU/Absolute-Zero-Reasoner)    [![Hugging Face Collection](https://img.shields.io/badge/AZR_Collection-fcd022?style=for-the-badge&logo=huggingface&logoColor=000)](https://huggingface.co/collections/andrewzh/absolute-zero-reasoner-68139b2bca82afb00bc69e5b)    [![W&B Logs](https://img.shields.io/badge/📁_W%26B_Logs-fcd022?style=for-the-badge&logo=wandb&logoColor=000)](https://wandb.ai/andrewzhao112/AbsoluteZeroReasoner)
+<div align="center" style="font-family: Arial, sans-serif;">
+  <p>
+    <a href="#news" style="text-decoration: none; font-weight: bold;">🎉 News</a> •
+    <a href="#links" style="text-decoration: none; font-weight: bold;">🔗 Links</a> •
+    <a href="#todo" style="text-decoration: none; font-weight: bold;">📝 Roadmap</a> •
+    <a href="#algorithm-flow" style="text-decoration: none; font-weight: bold;">⚙️ Algorithm Flow</a> •
+    <a href="#results" style="text-decoration: none; font-weight: bold;">📊 Results</a>
+  </p>
+  <p>
+    <a href="#getting-started" style="text-decoration: none; font-weight: bold;">✨ Getting Started</a> •
+    <a href="#training" style="text-decoration: none; font-weight: bold;">🏋️ Training</a> •
+    <a href="#usage" style="text-decoration: none; font-weight: bold;">🔧 Usage</a> •
+    <a href="#evaluation-code" style="text-decoration: none; font-weight: bold;">📃 Evaluation</a>
+  </p>
+  <p>
+    <a href="#citation" style="text-decoration: none; font-weight: bold;">🎈 Citation</a> •
+    <a href="#acknowledgement" style="text-decoration: none; font-weight: bold;">🌻 Acknowledgement</a> •
+    <a href="#contact" style="text-decoration: none; font-weight: bold;">📧 Contact</a> •
+    <a href="#star-history" style="text-decoration: none; font-weight: bold;">📈 Star History</a>
+  </p>
+</div>
+</div>
+# 🚀 TestTime RLVR Implementation
+## 📋 Overview
+TestTime RLVR implements test-time reinforcement learning for enhanced reasoning capabilities using the AZR (Absolute Zero Reasoner) methodology. The system generates Input-Program-Output (IPO) triples from benchmark problems and creates three types of reasoning tasks (induction, deduction, abduction) to improve model performance at test time.
+## 🎯 Key Features
+- **Complete Pipeline**: LLM Solution Generation → IPO Extraction → Task Generation → LLM Evaluation → Reward Computation
+- **AZR Integration**: Full integration with Absolute Zero Reasoner templates and evaluation methods
+- **Benchmark Support**: MBPP+ and HumanEval+ datasets with structured data extraction
+- **Execution-based Evaluation**: Program execution comparison instead of string matching
+- **VLLM Optimization**: Faster inference with VLLM backend support
+## 📈 Implementation Status
+- ✅ **Phase 1**: Infrastructure Setup - Complete pipeline architecture
+- ✅ **Phase 2**: Benchmark System - MBPP+/HumanEval+ integration
+- ✅ **Phase 3**: AZR Template Integration - Three reasoning tasks implementation
+- ✅ **Phase 4**: Complete Pipeline - Fully functional end-to-end system
+- 🔄 **Phase 5**: RLVR Training - Reinforcement learning integration (In Progress)
+## 📦 Dataset Setup
+**Download required benchmark datasets:**
+```bash
+# Download MBPP+ and HumanEval+ datasets
+wget -O evaluation/code_eval/data/MbppPlus.jsonl https://huggingface.co/datasets/evalplus/mbppplus/resolve/main/MbppPlus.jsonl
+wget -O evaluation/code_eval/data/HumanEvalPlus.jsonl https://huggingface.co/datasets/evalplus/humanevalplus/resolve/main/HumanEvalPlus.jsonl
+```
+## 🚀 Quick Start
+### Running the Pipeline
+```bash
+# Navigate to test directory
+cd test/
+# Set GPU device
+export CUDA_VISIBLE_DEVICES=6
+# Execute complete pipeline
+bash run_testtime_gpu6.sh
+```
+### Command Line Options
+```bash
+# From test/ directory
+python test_complete_pipeline.py \
+    --model "Qwen/Qwen2.5-7B" \
+    --benchmark "mbpp" \
+    --problem_id "Mbpp/478" \
+    --max_tokens 2048 \
+    --gpu 6 \
+    --verbose \
+    --output_dir ../tmp
+```
+### Batch Evaluation
+```bash
+# From test/ directory
+bash run_batch_evaluation.sh "Qwen/Qwen2.5-7B" "mbpp" 10 6
+```
+### Supported Benchmarks
+- **MBPP+**: `--benchmark mbpp --problem_id "Mbpp/X"`
+- **HumanEval+**: `--benchmark humaneval --problem_id "HumanEval/X"`
+- **Test Mode**: `--benchmark test` (example problems)
+## 📊 Results Structure
+```
+tmp/{benchmark}/{problem_id}/          # Single problem results
+├── initial_solution/                  # LLM's original solution + correctness
+│   ├── {problem_id}_original_problem.txt    # Original benchmark problem
+│   ├── {problem_id}_llm_solution.txt        # LLM solution + correctness evaluation
+│   └── {problem_id}_extracted_program.py   # Extracted function code
+├── ipo_triples/                      # Input-Program-Output triples
+├── task_prompts/                     # Generated reasoning tasks
+├── llm_responses/                    # LLM responses to tasks
+├── extracted_answers/                # Extracted answers from responses
+├── {problem_id}_reward_analysis.json
+├── {problem_id}_reward_summary.txt
+└── {problem_id}_pipeline_summary.json
+test/batch_results/                   # Batch evaluation results
+├── batch_evaluation_{timestamp}/
+│   ├── batch_evaluation_results.json      # Detailed results with correctness stats
+│   └── evaluation_summary.md              # Summary report with accuracy rates
+```
+<!-- ============================================== -->
+<div align="left">
+  <h1 id="links">🔗 AZR References</h1>
+  <hr style="height: 3px; background: linear-gradient(90deg, #EF8E8D, #5755A3); border: none; border-radius: 3px;">
+</div>
+- 🏠 [[AZR Project Page]](https://andrewzh112.github.io/absolute-zero-reasoner/)
+- 📜 [[AZR Paper]](https://arxiv.org/abs/2505.03335)
+- 🤗 [[AZR Models]](https://huggingface.co/collections/andrewzh/absolute-zero-reasoner-68139b2bca82afb00bc69e5b)
+- 💻 [[AZR Code]](https://github.com/LeapLabTHU/Absolute-Zero-Reasoner)
+- 📁 [[AZR Logs]](https://wandb.ai/andrewzhao112/AbsoluteZeroReasoner)
+<!-- ============================================== -->
+<div align="left">
+  <h1 id="todo">📝 TestTime RLVR Roadmap</h1>
+  <hr style="height: 3px; background: linear-gradient(90deg, #EF8E8D, #5755A3); border: none; border-radius: 3px;">
+</div>
+<div style="margin-bottom: 0.8rem; padding: 0.8rem 1.2rem; background-color: rgba(87, 85, 163, 0.1); border-left: 5px solid #5755A3; border-radius: 8px; display: flex; align-items: center;">
+  <span style="font-size: 1.2em; margin-right: 0.8rem; color: #5755A3;">✅</span>
+  <span style="text-decoration: line-through; color: #AAA; font-size: 1.1em;">Complete Pipeline Implementation</span>
+</div>
+<div style="margin-bottom: 0.8rem; padding: 0.8rem 1.2rem; background-color: rgba(87, 85, 163, 0.1); border-left: 5px solid #5755A3; border-radius: 8px; display: flex; align-items: center;">
+  <span style="font-size: 1.2em; margin-right: 0.8rem; color: #5755A3;">✅</span>
+  <span style="text-decoration: line-through; color: #AAA; font-size: 1.1em;">IPO Triple Extraction with Structured Data</span>
+</div>
+<div style="margin-bottom: 0.8rem; padding: 0.8rem 1.2rem; background-color: rgba(87, 85, 163, 0.1); border-left: 5px solid #5755A3; border-radius: 8px; display: flex; align-items: center;">
+  <span style="font-size: 1.2em; margin-right: 0.8rem; color: #5755A3;">✅</span>
+  <span style="text-decoration: line-through; color: #AAA; font-size: 1.1em;">Three Reasoning Tasks (Induction/Deduction/Abduction)</span>
+</div>
+<div style="margin-bottom: 0.8rem; padding: 0.8rem 1.2rem; background-color: rgba(87, 85, 163, 0.1); border-left: 5px solid #5755A3; border-radius: 8px; display: flex; align-items: center;">
+  <span style="font-size: 1.2em; margin-right: 0.8rem; color: #5755A3;">✅</span>
+  <span style="text-decoration: line-through; color: #AAA; font-size: 1.1em;">Execution-based Evaluation System</span>
+</div>
+<div style="margin-bottom: 0.8rem; padding: 0.8rem 1.2rem; background-color: rgba(239, 142, 141, 0.1); border-left: 5px solid #EF8E8D; border-radius: 8px; display: flex; align-items: center;">
+  <span style="font-size: 1.2em; margin-right: 0.8rem; color: #EF8E8D;">🔄</span>
+  <span style="color: #333; font-size: 1.1em;">VeRL Integration for RLVR Training</span>
+</div>
+<div style="margin-bottom: 0.8rem; padding: 0.8rem 1.2rem; background-color: rgba(239, 142, 141, 0.1); border-left: 5px solid #EF8E8D; border-radius: 8px; display: flex; align-items: center;">
+  <span style="font-size: 1.2em; margin-right: 0.8rem; color: #EF8E8D;">📋</span>
+  <span style="color: #333; font-size: 1.1em;">Multi-Problem Batch Processing</span>
+</div>
+<!-- ============================================== -->
+<div align="left">
+  <h1 id="algorithm-flow">⚙️ TestTime RLVR Algorithm Flow</h1>
+  <hr style="height: 3px; background: linear-gradient(90deg, #EF8E8D, #5755A3); border: none; border-radius: 3px;">
+</div>
+TestTime RLVR implements a comprehensive test-time reasoning pipeline based on AZR methodology:
+### 🔄 Pipeline Stages
+1. **<span style="color:#EF8E8D">LLM Solution Generation</span>**: The model generates an initial solution for a given benchmark problem (MBPP+/HumanEval+)
+2. **<span style="color:#5755A3">IPO Triple Extraction</span>**: Input-Program-Output triples are created using structured benchmark data and LLM solution execution
+3. **<span style="color:#EF8E8D">Task Generation</span>**: Three types of reasoning tasks are generated:
+   - **Induction**: Infer the function from input/output pairs + message
+   - **Deduction**: Predict output from code + input
+   - **Abduction**: Predict input from code + output
+4. **<span style="color:#5755A3">LLM Evaluation</span>**: The model attempts to solve the generated reasoning tasks using AZR prompts and templates
+5. **<span style="color:#EF8E8D">Reward Computation</span>**: Solutions are verified through program execution, receiving accuracy-based rewards
+### 🎯 Key Innovations
+- **Structured Data Integration**: Direct use of benchmark `base_input`/`plus_input` instead of assert parsing
+- **Execution-based Evaluation**: Program execution comparison for accurate task evaluation
+- **Function Name Normalization**: Consistent `f` function naming following AZR methodology
+- **Docstring Utilization**: LLM-generated docstrings enhance induction task quality
+<!-- ============================================== -->
+<div align="left">
+  <h1 id="results">📊 Results</h1>
+  <hr style="height: 3px; background: linear-gradient(90deg, #EF8E8D, #5755A3); border: none; border-radius: 3px;">
+</div>
+## Main Results
+Our approach achieves strong performance across both code and math reasoning benchmarks without using any external data:
+<table>
+  <thead>
+    <tr>
+      <th align="center">Model</th>
+      <th align="center">Base</th>
+      <th align="center">#data</th>
+      <th align="center">Code Avg</th>
+      <th align="center">Math Avg</th>
+      <th align="center">Total Avg</th>
+    </tr>
+  </thead>
+  <tbody>
+    <!-- Base Models Section -->
+    <tr>
+      <td colspan="6" align="center"><b>Base Models</b></td>
+    </tr>
+    <tr>
+      <td>Qwen2.5-7B</td>
+      <td>-</td>
+      <td>-</td>
+      <td>52.0</td>
+      <td>27.5</td>
+      <td>39.8</td>
+    </tr>
+    <tr>
+      <td>Qwen2.5-7B-Ins</td>
+      <td>-</td>
+      <td>-</td>
+      <td>56.3</td>
+      <td>37.0</td>
+      <td>46.7</td>
+    </tr>
+    <tr>
+      <td>Qwen2.5-7B-Coder</td>
+      <td>-</td>
+      <td>-</td>
+      <td>56.6</td>
+      <td>23.9</td>
+      <td>40.2</td>
+    </tr>
+    <!-- Zero-Style Reasoners with Code Data -->
+    <tr>
+      <td colspan="6" align="center"><b>Reasoners Trained on Curated Code Data</b></td>
+    </tr>
+    <tr>
+      <td>AceCoder-RM</td>
+      <td>Ins</td>
+      <td>22k</td>
+      <td>58.3</td>
+      <td>37.4</td>
+      <td>47.9</td>
+    </tr>
+    <tr>
+      <td>AceCoder-RM</td>
+      <td>Coder</td>
+      <td>22k</td>
+      <td>57.3</td>
+      <td>27.5</td>
+      <td>42.4</td>
+    </tr>
+    <tr>
+      <td>AceCoder-Rule</td>
+      <td>Ins</td>
+      <td>22k</td>
+      <td>55.4</td>
+      <td>36.9</td>
+      <td>46.2</td>
+    </tr>
+    <tr>
+      <td>AceCoder-Rule</td>
+      <td>Coder</td>
+      <td>22k</td>
+      <td>60.0</td>
+      <td>28.5</td>
+      <td>44.3</td>
+    </tr>
+    <tr>
+      <td>CodeR1-LC2k</td>
+      <td>Ins</td>
+      <td>2k</td>
+      <td>60.5</td>
+      <td>35.6</td>
+      <td>48.0</td>
+    </tr>
+    <tr>
+      <td>CodeR1-12k</td>
+      <td>Ins</td>
+      <td>10k</td>
+      <td>61.3</td>
+      <td>33.5</td>
+      <td>47.4</td>
+    </tr>
+    <!-- Zero-Style Reasoners with Math Data -->
+    <tr>
+      <td colspan="6" align="center"><b>Reasoners Trained on Curated Math Data</b></td>
+    </tr>
+    <tr>
+      <td>PRIME-Zero</td>
+      <td>Coder</td>
+      <td>484k</td>
+      <td>37.2</td>
+      <td><b>45.8</b></td>
+      <td>41.5</td>
+    </tr>
+    <tr>
+      <td>SimpleRL-Zoo</td>
+      <td>Base</td>
+      <td>8.5k</td>
+      <td>54.0</td>
+      <td>38.5</td>
+      <td>46.3</td>
+    </tr>
+    <tr>
+      <td>Oat-Zero</td>
+      <td>Math</td>
+      <td>8.5k</td>
+      <td>45.4</td>
+      <td>44.3</td>
+      <td>44.9</td>
+    </tr>
+    <tr>
+      <td>ORZ</td>
+      <td>Base</td>
+      <td>57k</td>
+      <td>55.6</td>
+      <td>41.6</td>
+      <td>48.6</td>
+    </tr>
+    <!-- Our Approach -->
+    <tr style="background-color: rgba(239, 142, 141, 0.1);">
+      <td colspan="6" align="center"><b>Absolute Zero Training w/ No Curated Data (Ours)</b></td>
+    </tr>
+    <tr style="background-color: rgba(239, 142, 141, 0.1);">
+      <td>AZR (Ours)</td>
+      <td>Base</td>
+      <td><b>0</b></td>
+      <td>55.2 <span style="color:#00AA00">+3.2</span></td>
+      <td>38.4 <span style="color:#00AA00">+10.9</span></td>
+      <td>46.8 <span style="color:#00AA00">+7.0</span></td>
+    </tr>
+    <tr style="background-color: rgba(87, 85, 163, 0.1);">
+      <td>AZR (Ours)</td>
+      <td>Coder</td>
+      <td><b>0</b></td>
+      <td><b>61.6</b> <span style="color:#00AA00">+5.0</span></td>
+      <td>39.1 <span style="color:#00AA00">+15.2</span></td>
+      <td><b>50.4</b> <span style="color:#00AA00">+10.2</span></td>
+    </tr>
+  </tbody>
+</table>
+## Scaling Results
+AZR shows consistent improvements across model sizes and types:
+<table>
+  <thead>
+    <tr>
+      <th align="center">Model Family</th>
+      <th align="center">Variant</th>
+      <th align="center">Code Avg</th>
+      <th align="center">Math Avg</th>
+      <th align="center">Total Avg</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Llama3.1-8b</td>
+      <td></td>
+      <td>28.5</td>
+      <td>3.4</td>
+      <td>16.0</td>
+    </tr>
+    <tr style="background-color: rgba(87, 85, 163, 0.1);">
+      <td>Llama3.1-8b</td>
+      <td>+ AZR (Ours)</td>
+      <td>31.6 <span style="color:#00AA00">+3.1</span></td>
+      <td>6.8 <span style="color:#00AA00">+3.4</span></td>
+      <td>19.2 <span style="color:#00AA00">+3.2</span></td>
+    </tr>
+    <tr>
+      <td>Qwen2.5-3B Coder</td>
+      <td></td>
+      <td>51.2</td>
+      <td>18.8</td>
+      <td>35.0</td>
+    </tr>
+    <tr style="background-color: rgba(87, 85, 163, 0.1);">
+      <td>Qwen2.5-3B Coder</td>
+      <td>+ AZR (Ours)</td>
+      <td>54.9 <span style="color:#00AA00">+3.7</span></td>
+      <td>26.5 <span style="color:#00AA00">+7.7</span></td>
+      <td>40.7 <span style="color:#00AA00">+5.7</span></td>
+    </tr>
+    <tr>
+      <td>Qwen2.5-7B Coder</td>
+      <td></td>
+      <td>56.6</td>
+      <td>23.9</td>
+      <td>40.2</td>
+    </tr>
+    <tr style="background-color: rgba(87, 85, 163, 0.1);">
+      <td>Qwen2.5-7B Coder</td>
+      <td>+ AZR (Ours)</td>
+      <td>61.6 <span style="color:#00AA00">+5.0</span></td>
+      <td>39.1 <span style="color:#00AA00">+15.2</span></td>
+      <td>50.4 <span style="color:#00AA00">+10.2</span></td>
+    </tr>
+    <tr>
+      <td>Qwen2.5-14B Coder</td>
+      <td></td>
+      <td>60.0</td>
+      <td>20.2</td>
+      <td>40.1</td>
+    </tr>
+    <tr style="background-color: rgba(87, 85, 163, 0.1);">
+      <td>Qwen2.5-14B Coder</td>
+      <td>+ AZR (Ours)</td>
+      <td>63.6 <span style="color:#00AA00">+3.6</span></td>
+      <td>43.0 <span style="color:#00AA00">+22.8</span></td>
+      <td>53.3 <span style="color:#00AA00">+13.2</span></td>
+    </tr>
+  </tbody>
+</table>
+<!-- ============================================== -->
+<div align="left">
+  <h1 id="getting-started">✨ Getting Started</h1>
+  <hr style="height: 3px; background: linear-gradient(90deg, #EF8E8D, #5755A3); border: none; border-radius: 3px;">
+</div>
+## 🎄 Environment Setup
+```bash
+conda env create -f azr_env.yml
+conda activate azr
+pip install -r flashattn_requirements.txt
+```
+## 💾 Data Processing
+### Process evaluation data on CruxEval / LiveCodeBench Execution during AZR Self-play
+```bash
+python -m absolute_zero_reasoner.data_construction.process_code_reasoning_data
+```
+<!-- ============================================== -->
+<div align="left">
+  <h1 id="training">🏋️ Training</h1>
+  <hr style="height: 3px; background: linear-gradient(90deg, #EF8E8D, #5755A3); border: none; border-radius: 3px;">
+</div>
+> **⚠️WARNING⚠️**: The Python executor in this repository is very raw and intended for research purposes only. It is not secure for production environments. We plan to update our executor to more secure implementations in the future. Your use of our code is at your own discretion and risk.
+## 🫛 Seeding (Optional)
+We provide the seed datasets we collected by prompting each model in data/. If you want to create your own seed data, use the following script:
+```bash
+export OUTPUT_SEED_PATH=data/<new_ded_abd_seed_data_name>.jsonl
+export OUTPUT_CODE_F_SEED_PATH=data/<new_ind_seed_data_name>.jsonl
+bash scripts/seeding/<7b|14b|coder3b|coder7b|coder14b|llama>.sh
+```
+## ♟️ Self-play
+3b models need 2 X 80gb GPUs, 7/8b models need 4 X 80gb, 14b requires 8 X 80gb
+```bash
+bash scripts/selfplay/<7b|14b|coder3b|coder7b|coder14b|llama>.sh
+```
+If you want to use your own ded/abd or ind seed dataset:
+```bash
+export OUTPUT_SEED_PATH=data/<your_ded_abd_seed_data_name>.jsonl
+export OUTPUT_CODE_F_SEED_PATH=data/<your_ind_seed_data_name>.jsonl
+bash scripts/selfplay/<7b|14b|coder3b|coder7b|coder14b|llama>.sh
+```
+For using the newly supported sandbox-fusion executor, use docker and set `azr.executor=sandboxfusion`.
+## 🌚 Resuming Runs
+When resuming runs, put the original run wandb id into the script, i.e., `trainer.wandb_run_id=<run_id>`.
+## 🤗 Converting veRL checkpoints to HF format
+```bash
+python -m absolute_zero_reasoner.utils.convert2hf \
+  <veRL_ckpt_path>/actor \
+  <veRL_ckpt_path>/actor/huggingface/ \
+  <hf_ckpt_path>
+```
+## 📈 Design Your Own Intrinsic Rewards!
+In the configs, add your own rewards to `azr.reward.generation_reward_config`; see the ones already implemented, such as the diversity and complexity rewards, for reference. Be creative!
+<!-- ============================================== -->
+<div align="left">
+  <h1 id="usage">🔧 Usage</h1>
+  <hr style="height: 3px; background: linear-gradient(90deg, #EF8E8D, #5755A3); border: none; border-radius: 3px;">
+</div>
+We use the DeepSeek R1 `<think>` & `<answer>` tags as the prompt template:
+```
+A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. User: {question}\nAssistant: <think>
+```
+<!-- ============================================== -->
+<div align="left">
+  <h1 id="evaluation-code">📃 Evaluation Code</h1>
+  <hr style="height: 3px; background: linear-gradient(90deg, #EF8E8D, #5755A3); border: none; border-radius: 3px;">
+</div>
+## LiveCodeBench
+Setup: LCB needs to first download the data
+```bash
+git clone https://hf-mirror.com/datasets/livecodebench/code_generation_lite evaluation/code_eval/coding/LiveCodeBench/code_generation_lite
+```
+Evaluation:
+```bash
+bash evaluation/code_eval/scripts/run_lcb_gen.sh --model <andrewzh/Absolute_Zero_Reasoner-Coder-3b>
+```
+## Evalplus
+A new conda env is needed for evalplus:
+```bash
+conda create -n evalplus python=3.11
+pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus@d362e933265c3e7e3df8101c930a89c3c470cd9f"
+```
+Evaluation:
+```bash
+conda activate evalplus
+bash evaluation/code_eval/scripts/run_evalplus.sh 0 <humaneval|mbpp> <andrewzh/Absolute_Zero_Reasoner-Coder-3b>
+```
+## Math
+Please refer to [evaluation/math_eval/README.md](evaluation/math_eval/README.md) for math evaluation.
+<!-- ============================================== -->
+<div align="left">
+  <h1 id="citation">🎈 Citation</h1>
+  <hr style="height: 3px; background: linear-gradient(90deg, #EF8E8D, #5755A3); border: none; border-radius: 3px;">
+</div>
+If you find Absolute Zero Reasoner helpful, please cite us.
+```bibtex
+@misc{zhao2025absolutezeroreinforcedselfplay,
+      title={Absolute Zero: Reinforced Self-play Reasoning with Zero Data},
+      author={Andrew Zhao and Yiran Wu and Yang Yue and Tong Wu and Quentin Xu and Yang Yue and Matthieu Lin and Shenzhi Wang and Qingyun Wu and Zilong Zheng and Gao Huang},
+      year={2025},
+      eprint={2505.03335},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG},
+      url={https://arxiv.org/abs/2505.03335},
+}
+```
+<!-- ============================================== -->
+<div align="left">
+  <h1 id="acknowledgement">🌻 Acknowledgement</h1>
+  <hr style="height: 3px; background: linear-gradient(90deg, #EF8E8D, #5755A3); border: none; border-radius: 3px;">
+</div>
+Our reinforcement learning training codebase is a fork of the [veRL framework](https://github.com/volcengine/verl). For rollouts, we used [vLLM](https://github.com/vllm-project/vllm). The Python executor components are adapted from the [QwQ Repository](https://github.com/QwenLM/QwQ/tree/main/eval/eval/math_opensource_utils). Additionally, we borrowed our README structure from [PRIME](https://github.com/PRIME-RL/PRIME).
+Many thanks to the authors of these projects for their excellent contributions!
+<!-- ============================================== -->
+<div align="left">
+  <h1 id="contact">📧 Contact</h1>
+  <hr style="height: 3px; background: linear-gradient(90deg, #EF8E8D, #5755A3); border: none; border-radius: 3px;">
+</div>
+Feel free to contact Andrew Zhao via email: [email protected]
+<!-- ============================================== -->
+<div align="left">
+  <h1 id="star-history">📈 Star History</h1>
+  <hr style="height: 3px; background: linear-gradient(90deg, #EF8E8D, #5755A3); border: none; border-radius: 3px;">
+</div>
+[![Star History Chart](https://api.star-history.com/svg?repos=LeapLabTHU/Absolute-Zero-Reasoner&type=Date)](https://www.star-history.com/#LeapLabTHU/Absolute-Zero-Reasoner&Date)

Update/2025-01-25_humaneval_fixes.md ADDED Viewed

	@@ -0,0 +1,113 @@

+# TestTime RLVR-v2 HumanEval 평가 수정 사항
+날짜: 2025-01-25
+## 개요
+HumanEval 벤치마크에서 0% 정확도 문제를 해결하기 위한 전체적인 수정 작업을 수행했습니다.
+## 주요 문제점 및 해결 방안
+### 1. Import 문 누락 문제
+**문제**: HumanEval 솔루션에서 `from typing import List` 등의 import 문이 누락되어 실행 실패
+**해결**:
+- EvalPlus 방식과 동일하게 프롬프트에서 import 문을 추출하여 자동 추가
+- `_add_imports_from_prompt()` 메서드 추가
+- 솔루션에 없는 import를 임의로 추가하던 기존 치팅 방식(`_add_missing_imports()`) 제거
+### 2. IPO Triple 추출 문제
+**문제**:
+- base_input의 첫 번째 항목만 사용
+- HumanEval에서 테스트 케이스를 사용하여 IPO 생성 (치팅)
+**해결**:
+- HumanEval은 docstring 예제만 사용하도록 변경
+- `_extract_docstring_examples()` 메서드 추가
+- 입력 형식 분리: 평가용 인자와 표시용 전체 함수 호출
+### 3. 프롬프트 일관성 문제
+**문제**:
+- `batch_evaluate_testtime.py`의 하드코딩된 프롬프트가 `solution_generator.py`와 불일치
+- HumanEval/50과 같은 다중 함수 문제 처리 미흡
+**해결**:
+- 모든 프롬프트를 `solution_generator.py`와 일치하도록 수정
+- 다중 함수 케이스를 위한 특별 처리 추가
+### 4. Task 생성 시 문제
+**문제**:
+- HumanEval에서 doctest 예시가 포함되어 치팅 발생
+- Induction task의 message가 일반적인 메시지 사용
+**해결**:
+- `_remove_doctest_examples()` 메서드로 doctest 제거
+- HumanEval의 경우 함수 설명을 추출하여 message로 사용
+### 5. 평가 실패 문제
+**문제**:
+- Induction: 전체 함수 호출을 사용하여 평가 실패
+- Abduction: 인자만 저장되어 MBPP와 다른 형식으로 평가
+**해결**:
+- IPO triple에 `input`(인자)와 `full_input_str`(전체 호출) 분리 저장
+- Abduction expected_solution을 `full_input_str` 사용하도록 수정
+## 수정된 파일 목록
+### 1. `/home/ubuntu/RLVR/TestTime-RLVR-v2/absolute_zero_reasoner/testtime/solution_generator.py`
+- `_add_imports_from_prompt()` 메서드 추가
+- `_add_missing_imports()` 제거 (치팅 방지)
+- HumanEval용 프롬프트 개선
+- 다중 함수 처리 로직 추가
+### 2. `/home/ubuntu/RLVR/TestTime-RLVR-v2/absolute_zero_reasoner/testtime/ipo_extractor.py`
+- `_extract_docstring_examples()` 메서드 추가
+- HumanEval은 docstring 예제만 사용하도록 수정
+- 입력 형식 분리 (평가용/표시용)
+### 3. `/home/ubuntu/RLVR/TestTime-RLVR-v2/absolute_zero_reasoner/testtime/task_generator.py`
+- `_remove_doctest_examples()` 메서드 추가
+- `_extract_function_description()` 메서드 추가
+- HumanEval induction message 개선
+- Abduction expected_solution을 전체 함수 호출로 수정
+### 4. `/home/ubuntu/RLVR/TestTime-RLVR-v2/test/batch_evaluate_testtime.py`
+- 하드코딩된 프롬프트를 `solution_generator.py`와 일치하도록 수정
+- 전체 LLM 프롬프트 로깅 추가
+## 기술적 세부사항
+### IPO Triple 형식 차이
+```json
+// MBPP (기존)
+{
+  "input": "intersperse([], 4)",
+  "full_input_str": "intersperse([], 4)"
+}
+// HumanEval (수정됨)
+{
+  "input": "[], 4",  // 평가용 (인자만)
+  "full_input_str": "intersperse([], 4)"  // 표시용 (전체 호출)
+}
+```
+### Import 추출 로직
+```python
+def _add_imports_from_prompt(self, prompt: str, solution: str) -> str:
+    # 프롬프트에서 import 문 추출
+    # solution 앞에 import 문 추가
+    # EvalPlus와 동일한 방식
+```
+### Doctest 제거
+```python
+def _remove_doctest_examples(self, code: str) -> str:
+    # docstring 내의 >>> 예시 제거
+    # 함수 설명은 유지
+```
+## 성과
+- HumanEval 평가가 정상적으로 작동
+- 치팅 없이 공정한 평가 수행
+- MBPP와 일관된 평가 방식 유지
+- EvalPlus와 호환되는 import 처리
+## 향후 개선사항
+- 더 많은 HumanEval 문제에 대한 테스트 필요
+- 다양한 edge case 처리 개선
+- 성능 최적화

Update/Phase1_Infrastructure_Setup.md ADDED Viewed

	@@ -0,0 +1,65 @@

+# Phase 1: 기반 인프라 구축 완료
+## 📁 디렉토리 구조 설정
+### 새로 생성된 프로젝트
+- `/home/ubuntu/RLVR/TestTime-RLVR-v2/` - AZR 기반 새 프로젝트
+### 핵심 디렉토리 구조
+```
+TestTime-RLVR-v2/
+├── absolute_zero_reasoner/
+│   ├── testtime/                    # TestTime 전용 컴포넌트
+│   │   ├── __init__.py             # 모듈 초기화
+│   │   └── config.py               # TestTime 설정
+│   ├── utils/code_utils/           # AZR Python Executor (기존)
+│   ├── rewards/                    # AZR Reward Manager (기존)
+│   └── trainer/ppo/                # AZR PPO Trainer (기존)
+├── logs/                           # 로깅 시스템
+│   ├── problems/                   # 문제별 로그
+│   ├── ipo_extraction/            # IPO 추출 로그
+│   ├── task_generation/           # 태스크 생성 로그
+│   ├── training/                  # 학습 로그
+│   └── performance/               # 성능 변화 로그
+├── evaluation/code_eval/data/      # 벤치마크 데이터
+│   ├── HumanEvalPlus.jsonl        # ✅ 존재 확인
+│   └── MbppPlus.jsonl             # ✅ 존재 확인
+└── Update/                        # 변경사항 추적
+```
+## 🔧 생성된 핵심 컴포넌트
+### 1. TestTimeConfig 클래스
+- **위치**: `absolute_zero_reasoner/testtime/config.py`
+- **기능**: TestTime RLVR 전체 설정 관리
+- **특징**: AZR 호환성 유지하면서 TestTime 특화 설정 추가
+### 2. BenchmarkConfig 클래스
+- **위치**: `absolute_zero_reasoner/testtime/config.py`
+- **기능**: 벤치마크별 설정 (HumanEval+, MBPP+)
+- **특징**: 벤치마크별 시작 인덱스, 경로 등 관리
+## ✅ 완료된 작업
+1. **프로젝트 복사**: AZR → TestTime-RLVR-v2
+2. **디렉토리 구조**: 로그 및 컴포넌트 디렉토리 생성
+3. **기본 설정**: TestTimeConfig, BenchmarkConfig 클래스 생성
+4. **데이터 확인**: HumanEval+, MBPP+ 데이터 파일 존재 확인
+5. **모듈 구조**: testtime 패키지 초기화
+## 🎯 다음 단계 (Phase 2)
+1. **BenchmarkProblemLoader** 구현 - 벤치마크 문제 로딩
+2. **InitialSolutionGenerator** 구현 - 초기 솔루션 생성
+3. **벤치마크 검증 시스템** 구현 - 솔루션 정확성 검증
+## 📝 주요 설계 원칙
+- **AZR 호환성**: 기존 AZR 컴포넌트 최대한 재사용
+- **경량화**: TestTime에 적합한 빠른 적응 학습
+- **포괄적 로깅**: 모든 단계별 상세 로그 기록
+- **모듈성**: 각 컴포넌트 독립적 테스트 가능
+---
+**생성 일시**: 2025-07-16
+**상태**: ✅ 완료

Update/Phase2_Benchmark_System.md ADDED Viewed

	@@ -0,0 +1,85 @@

+# Phase 2: 벤치마크 문제 풀이 시스템 완료
+## ✅ 구현된 컴포넌트
+### 1. BenchmarkProblemLoader
+- **파일**: `absolute_zero_reasoner/testtime/benchmark_loader.py`
+- **기능**:
+  - HumanEval+, MBPP+ 문제 로딩
+  - 테스트 케이스 추출 (assert 문 파싱)
+  - 솔루션 검증 (구문 + 실행)
+  - 배치 로딩 및 통계 정보 제공
+- **기반**: 기존 `load_humaneval_problem` 함수 확장
+### 2. InitialSolutionGenerator
+- **파일**: `absolute_zero_reasoner/testtime/solution_generator.py`
+- **기능**:
+  - AZR 스타일 모델 로딩 (flash attention, gradient checkpointing)
+  - Greedy 생성 (AZR evaluation과 동일)
+  - 함수 정의 자동 복구
+  - 대체 솔루션 생성 (문제별 템플릿)
+- **기반**: 기존 `generate_initial_solution` 함수 클래스화
+### 3. TestTimeLogger
+- **파일**: `absolute_zero_reasoner/testtime/logger.py`
+- **기능**:
+  - 요구사항 1: 벤치마크 문제 + LLM 답변 + 정답 여부
+  - 요구사항 2: IPO 추출 + 태스크 생성 로그
+  - 요구사항 3: 태스크 정확도 + reward 로그
+  - 요구사항 4: VeRL 학습 진행 로그
+  - JSON 형태 구조화된 로그 저장
+### 4. 설정 시스템
+- **파일**: `absolute_zero_reasoner/testtime/config.py`
+- **클래스**: `TestTimeConfig`, `BenchmarkConfig`
+- **기능**: AZR 호환 + TestTime 특화 설정
+## 🧪 테스트 결과
+### 기본 기능 테스트 (✅ 3/3 통과)
+```
+Configuration: ✅ PASS
+Logger: ✅ PASS
+BenchmarkLoader: ✅ PASS
+```
+### 검증된 기능
+- ✅ MBPP 문제 로딩 (Mbpp/2 성공)
+- ✅ 문제 통계 (378개 문제 확인)
+- ✅ 로깅 시스템 (5개 카테고리)
+- ✅ 설정 관리 (AZR 호환)
+## 📁 생성된 구조
+```
+TestTime-RLVR-v2/absolute_zero_reasoner/testtime/
+├── __init__.py                # 패키지 초기화
+├── config.py                  # 설정 클래스
+├── benchmark_loader.py        # 벤치마크 로더
+├── solution_generator.py      # 솔루션 생성기
+└── logger.py                  # 로깅 시스템
+```
+## 🗑️ 정리된 항목
+- ✅ Python 캐시 파일 (`__pycache__`, `*.pyc`) 삭제
+- ✅ 불필요한 임포트 정리 (아직 구현되지 않은 컴포넌트 주석 처리)
+- ✅ 테스트 파일을 `/tmp/azr/`에 임시 저장
+## 🎯 다음 단계 (Phase 3)
+Phase 3에서 구현할 **IPO Triple 추출 시스템**:
+1. **IPOTripleExtractor** - AZR Python Executor 기반 IPO 추출
+2. **TripleValidator** - 추출된 트리플 검증
+3. **AZR 연동** - `utils/code_utils/python_executor.py` 활용
+### AZR 컴포넌트 활용 계획
+- `absolute_zero_reasoner/utils/code_utils/python_executor.py` - 코드 실행
+- `absolute_zero_reasoner/trainer/ppo/azr_ray_trainer.py:641-655` - IPO 생성 로직
+- `absolute_zero_reasoner/rewards/reward_managers.py:220-233` - 검증 로직
+---
+**생성 일시**: 2025-07-16
+**상태**: ✅ 완료
+**테스트**: ✅ 통과 (3/3)

Update/Phase3_AZR_Template_Integration.md ADDED Viewed

	@@ -0,0 +1,100 @@

+# Phase 3 개선: AZR 템플릿 직접 통합 완료
+## ✅ 주요 개선사항
+### 1. AZR 템플릿 직접 사용
+- **기존**: 단순화된 TestTime 전용 템플릿 (20-30라인)
+- **개선**: AZR 원본 템플릿 직접 활용 (2000+ 문자)
+- **효과**: 상세한 제약사항, 예시, 평가기준 포함
+### 2. 태스크 타입별 AZR 매핑
+| TestTime 태스크 | AZR 문제 타입 | 설명 |
+|-----------------|---------------|------|
+| **Induction**   | `code_f`     | 함수 생성 문제 |
+| **Deduction**   | `code_o`     | 출력 예측 문제 |
+| **Abduction**   | `code_i`     | 입력 생성 문제 |
+### 3. 코드 구조 최적화
+- **템플릿 임포트**: `from ..data_construction.prompts import get_code_problem_generator_prompt`
+- **불필요한 코드 제거**: 기존 단순 템플릿 코드 삭제 (150+ 라인 정리)
+- **매개변수 수정**: `composite_functions=[]` 추가로 오류 해결
+## 🧪 테스트 결과
+### AZR 템플릿 품질 비교
+```
+기존 TestTime 템플릿: 20-30라인, 기본적 설명
+AZR 템플릿: 2000+ 문자, 상세한 구조
+- 다양한 예시 제공
+- 명확한 제약사항
+- 체계적 평가기준
+- 단계별 추론 유도
+```
+### 생성된 프롬프트 예시
+- **Induction**: 2,274자 상세 프롬프트
+- **Deduction**: 3,057자 상세 프롬프트
+- **Abduction**: 3,063자 상세 프롬프트
+## 📂 정리된 파일
+### 불필요한 파일 삭제
+- ❌ `/tmp/azr/debug_ipo_failures.py`
+- ❌ `/tmp/azr/detailed_failure_analysis.py`
+- ❌ `/tmp/azr/complete_pipeline_details.py`
+- ❌ `/tmp/azr/show_full_pipeline.py`
+### 유지되는 핵심 파일
+- ✅ `/tmp/azr/ipo_failure_analysis.json` - IPO 실패 패턴 기록
+- ✅ `/tmp/azr/complete_pipeline_analysis.json` - 전체 파이프라인 분석
+- ✅ `/tmp/azr/test_azr_templates.py` - AZR 템플릿 테스트용
+## 🎯 핵심 발견사항
+### IPO 추출 실패 패턴
+```
+성공: 1/5 케이스 (Division by Zero만 성공)
+실패: 4/5 케이스
+- Infinite Loop: Timeout (5초)
+- Import Error: ModuleNotFoundError
+- Variable Error: NameError
+- No Function: 함수 정의 없음
+```
+### AZR 템플릿 효과
+- **프롬프트 길이**: 대폭 증가 (기존 20-30라인 → 약 2,000~3,000자)
+- **구조**: 체계적 multi-step 프롬프트
+- **품질**: 상세한 예시와 제약사항 포함
+## 📝 코드 변경사항
+### `task_generator.py` 주요 수정
+```python
+# 1. AZR 템플릿 임포트
+from ..data_construction.prompts import get_code_problem_generator_prompt
+# 2. 태스크별 AZR 템플릿 활용
+- induction: code_f (함수 생성)
+- deduction: code_o (출력 예측)
+- abduction: code_i (입력 생성)
+# 3. 매개변수 수정
+composite_functions=[]  # 빈 리스트로 설정
+```
+### 제거된 코드
+- 기존 템플릿 메서드 (150+ 라인)
+- 불필요한 임시 변수
+- 중복 테스트 파일들
+## 🎉 개선 효과
+1. **품질**: AZR 수준의 고품질 프롬프트 활용
+2. **일관성**: AZR 학습 데이터와 동일한 형식
+3. **효율성**: 코드 중복 제거 및 직접 재사용
+4. **확장성**: AZR의 모든 템플릿 기능 활용 가능
+---
+**완료 일시**: 2025-07-16
+**상태**: ✅ AZR 템플릿 통합 완료
+**다음 단계**: Phase 4 - RLVR 학습 시스템 구현

Update/Phase3_IPO_Extraction.md ADDED Viewed

	@@ -0,0 +1,129 @@

+# Phase 3: IPO Triple 추출 시스템 완료
+## ✅ 구현된 컴포넌트
+### 1. IPOTripleExtractor
+- **파일**: `absolute_zero_reasoner/testtime/ipo_extractor.py`
+- **기능**:
+  - AZR Python Executor 기반 안전한 코드 실행
+  - 테스트 케이스에서 입력-출력 쌍 추출
+  - 솔루션 실행으로 IPO 트리플 생성
+  - 합성 입력으로 추가 트리플 생성
+  - 트리플 검증 및 일관성 확인
+- **기반**: `python_executor.py`, `azr_ray_trainer.py` 로직
+### 2. TestTimeTaskGenerator
+- **파일**: `absolute_zero_reasoner/testtime/task_generator.py`
+- **기능**:
+  - Induction: 입력-출력에서 함수 추론
+  - Deduction: 함수+입력에서 출력 추론
+  - Abduction: 함수+출력에서 입력 추론
+  - AZR 기반 템플릿 시스템
+  - 학습용 데이터셋 생성
+- **기반**: `prompts.py`, `constructor.py` 템플릿
+## 🧪 테스트 결과
+### IPO 추출 시스템 테스트 (✅ 3/3 통과)
+```
+IPO Extractor: ✅ PASS
+Task Generator: ✅ PASS
+Integrated Pipeline: ✅ PASS
+```
+### 검증된 기능
+- ✅ **IPO 추출**: 5/6 유효한 트리플 생성
+- ✅ **태스크 생성**: 4개 태스크 (I:1, D:1, A:2)
+- ✅ **통합 파이프라인**: Mbpp/2 문제 전체 처리
+- ✅ **AZR Python Executor**: 안전한 코드 실행 확인
+## 📊 성능 지표
+### IPO 추출 성능
+- **테스트 문제**: `add_two(x)` 간단한 함수
+- **추출된 트리플**: 5개 (유효성 83%)
+- **실행 시간**: ~0.5초
+### 태스크 생성 성능
+- **MBPP 문제**: `similar_elements` 함수
+- **생성된 태스크**: 4개 (Induction 1, Deduction 1, Abduction 2)
+- **태스크 분포**: Induction(25%), Deduction(25%), Abduction(50%)
+### 통합 파이프라인
+```
+1. 문제 로딩 ✅ → 2. IPO 추출 ✅ → 3. 태스크 생성 ✅
+```
+## 🔍 핵심 기술 검증
+### 1. AZR Python Executor 연동
+- **ProcessPool 기반**: 안전한 샌드박스 실행
+- **타임아웃 관리**: 5초 제한으로 TestTime 최적화
+- **에러 처리**: 구문/실행 오류 분리 처리
+### 2. IPO 트리플 구조
+```json
+{
+  "id": "Mbpp/2_triple_0",
+  "input": "(3, 4, 5, 6), (5, 7, 4, 10)",
+  "program": "def similar_elements(test_tup1, test_tup2):\n  return tuple(set(test_tup1) & set(test_tup2))",
+  "expected_output": "(4, 5)",
+  "actual_output": "(4, 5)",
+  "function_name": "similar_elements",
+  "is_correct": true,
+  "extraction_method": "test_case"
+}
+```
+### 3. 3종 태스크 템플릿
+- **Induction**: "입력-출력에서 함수를 추론하세요"
+- **Deduction**: "함수와 입력으로 출력을 예측하세요"
+- **Abduction**: "함수와 출력으로 입력을 찾으세요"
+## 📁 업데이트된 구조
+```
+TestTime-RLVR-v2/absolute_zero_reasoner/testtime/
+├── __init__.py                # ✅ IPO, Task 추가
+├── config.py                  # ✅ 완료
+├── benchmark_loader.py        # ✅ 완료
+├── solution_generator.py      # ✅ 완료
+├── ipo_extractor.py          # 🆕 IPO 추출 시스템
+├── task_generator.py         # 🆕 3종 태스크 생성
+└── logger.py                  # ✅ 완료
+```
+## 📝 로깅 시스템 활용
+### 요구사항 준수 확인
+- ✅ **요구사항 2**: IPO 추출 + 태스크 생성 로그 기록
+- ✅ **구조화된 로그**: JSON 형태로 `/tmp/azr/logs/` 저장
+- ✅ **실시간 모니터링**: 추출/생성 과정 단계별 추적
+### 로그 카테고리
+```
+logs/
+├── ipo_extraction/           # IPO 추출 상세 로그
+├── task_generation/          # 태스크 생성 로그
+├── problems/                 # 문제별 처리 로그
+└── training/                 # 향후 학습 로그용
+```
+## 🎯 다음 단계 (Phase 4)
+Phase 4에서 구현할 **RLVR 학습 시스템**:
+1. **TestTimeRewardManager** - AZR reward_managers.py 기반
+2. **TestTimeRLVRTrainer** - AZR PPO/REINFORCE++ 활용
+3. **성능 평가 시스템** - 반복 학습 효과 측정
+### AZR 컴포넌트 활용 계획
+- `rewards/reward_managers.py` - r_solve 함수 활용
+- `trainer/ppo/reason_rl_ray_trainer.py` - PPO 학습 로직
+- veRL 프레임워크 통합
+---
+**생성 일시**: 2025-07-16
+**상태**: ✅ 완료
+**테스트**: ✅ 통과 (3/3)
+**핵심 성과**: AZR Python Executor 성공적 연동, 완전한 IPO 파이프라인 구축

Update/Phase4_Complete_Pipeline_Implementation.md ADDED Viewed

	@@ -0,0 +1,203 @@

+# Phase 4: Complete Pipeline Implementation
+## 🎯 Overview
+Complete TestTime RLVR pipeline implementation based on AZR (Absolute Zero Reasoner) methodology. The pipeline successfully integrates LLM solution generation, IPO triple extraction, three-task reasoning (induction/deduction/abduction), and execution-based evaluation.
+## 📋 Implementation Details
+### 1. Complete Pipeline Architecture
+- **File**: `test_complete_pipeline.py`
+- **Main Class**: `CompleteTestTimePipeline` in `complete_pipeline.py`
+- **Flow**: LLM Solution → IPO Extraction → Task Generation → LLM Evaluation → Reward Computation
+### 2. Key Components
+#### 2.1 Pipeline Execution (`test_complete_pipeline.py`)
+```python
+def main():
+    # Model loading with VLLM optimization
+    model, tokenizer = InitialSolutionGenerator.load_model_with_optimizations(
+        args.model, device, config, use_vllm=True
+    )
+    # Pipeline initialization
+    pipeline = CompleteTestTimePipeline(model, tokenizer, config, logger)
+    # Complete pipeline execution
+    result = pipeline.run_complete_pipeline(benchmark_config, problem_id)
+```
+#### 2.2 IPO Triple Extraction (Fixed)
+- **Issue**: Previously failed due to assert parsing regex issues
+- **Solution**: Switched to structured data extraction from `base_input`/`plus_input`
+- **Key Change**: Use LLM-generated solution execution for output computation
+```python
+def _extract_test_cases(self, problem: Dict[str, Any], solution: str) -> List[Tuple[str, str]]:
+    # Use structured benchmark data instead of assert parsing
+    actual_output = self._execute_llm_solution(solution, func_name, inp_args)
+```
+#### 2.3 Three Reasoning Tasks
+- **Induction**: Infer the function from input/output pairs + message
+- **Deduction**: Predict output from code + input
+- **Abduction**: Predict input from code + output
+#### 2.4 Evaluation System (AZR-based)
+- **Execution-based comparison** instead of string matching
+- **Function name normalization** to `f` for consistency
+- **Program execution** using AZR's PythonExecutor
+### 3. Critical Bug Fixes
+#### 3.1 IPO Extraction Failure (Solved)
+**Problem**: 0 triples extracted due to regex parsing failure
+```
+assert remove_lowercase("PYTHon")==('PYTH')  # Failed to parse parentheses
+```
+**Solution**: Use structured `base_input`/`plus_input` data directly
+#### 3.2 Function Name Normalization Bug (Solved)
+**Problem**: Function definitions normalized to `f` but calls weren't
+**Solution**: Normalize both definitions and calls consistently
+#### 3.3 Answer Extraction Pattern Mismatch (Solved)
+**Problem**: Induction tasks expected `<answer>` tags but the code looked for `` ```python `` blocks
+**Solution**: Updated extraction pattern to use `<answer>` tags consistently
+### 4. Prompt System Integration
+#### 4.1 AZR Template Usage
+- **File**: `absolute_zero_reasoner/data_construction/prompts.py`
+- **Key Templates**:
+  - `code_function_predictor_prompt` (induction)
+  - `code_input_predictor_prompt` (abduction)
+  - `code_output_predictor_prompt` (deduction)
+#### 4.2 Docstring Extraction and Usage
+- Extract docstrings from LLM-generated solutions
+- Use as `message` parameter in induction tasks
+- Improves task quality and LLM understanding
+### 5. Benchmark Integration
+#### 5.1 Supported Benchmarks
+- **MBPP+**: `/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/data/MbppPlus.jsonl`
+- **HumanEval+**: `/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/data/HumanEvalPlus.jsonl`
+- **Test mode**: Simple example problems
+#### 5.2 Problem Loading
+```python
+# Real benchmark usage
+benchmark_config = BenchmarkConfig.get_mbpp_config()
+problem = pipeline.benchmark_loader.load_problem(benchmark_config, "Mbpp/478")
+```
+### 6. Model Integration
+#### 6.1 VLLM Optimization
+- **Faster inference** with VLLM backend
+- **Temperature control**: 0.05 for reasoning tasks
+- **GPU memory management** with cleanup
+#### 6.2 Model Configuration
+```python
+config = TestTimeConfig(
+    model_name="Qwen/Qwen2.5-7B",
+    max_adaptation_steps=3,
+    task_distribution={'induction': 0.4, 'deduction': 0.3, 'abduction': 0.3},
+    max_tasks_per_type=3
+)
+```
+### 7. Result Output System
+#### 7.1 Detailed File Structure
+```
+/tmp/{benchmark}/{problem_id}/
+├── initial_solution/          # LLM's original solution
+├── ipo_triples/              # Input-Program-Output triples
+├── task_prompts/             # Generated reasoning tasks
+├── llm_responses/            # LLM responses to tasks
+├── extracted_answers/        # Extracted answers from responses
+├── {problem_id}_reward_analysis.json
+├── {problem_id}_reward_summary.txt
+└── {problem_id}_pipeline_summary.json
+```
+#### 7.2 Evaluation Metrics
+- **Accuracy**: Execution-based comparison (0.0 or 1.0)
+- **Task-type distribution**: Separate metrics for induction/deduction/abduction
+- **Overall pipeline success**: All steps completed successfully
+### 8. Execution Example
+#### 8.1 Command Line Usage
+```bash
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=6
+python test_complete_pipeline.py \
+    --model "Qwen/Qwen2.5-7B" \
+    --benchmark "mbpp" \
+    --problem_id "Mbpp/478" \
+    --max_tokens 2048 \
+    --gpu 6 \
+    --verbose \
+    --output_dir /home/ubuntu/RLVR/TestTime-RLVR-v2/tmp
+```
+#### 8.2 Success Output
+```
+🎉 PIPELINE TEST COMPLETED SUCCESSFULLY
+============================================================
+📁 상세 결과 파일 저장 중...
+📁 IPO 트리플 저장: /home/ubuntu/RLVR/TestTime-RLVR-v2/tmp/mbpp/Mbpp_478/ipo_triples/ (10개 파일)
+📁 태스크 프롬프트 저장: /home/ubuntu/RLVR/TestTime-RLVR-v2/tmp/mbpp/Mbpp_478/task_prompts/ (7개 파일)
+📁 LLM 응답 저장: /home/ubuntu/RLVR/TestTime-RLVR-v2/tmp/mbpp/Mbpp_478/llm_responses/ (7개 파일)
+📁 추출된 정답 저장: /home/ubuntu/RLVR/TestTime-RLVR-v2/tmp/mbpp/Mbpp_478/extracted_answers/ (7개 파일)
+```
+## 🚀 Current Status
+### ✅ Completed Features
+1. **Complete pipeline integration** with AZR methodology
+2. **IPO extraction** using structured benchmark data
+3. **Three reasoning tasks** generation and evaluation
+4. **Execution-based evaluation** system
+5. **VLLM optimization** for faster inference
+6. **Comprehensive result logging** and file output
+7. **Function name normalization** for consistency
+8. **Answer extraction** with proper pattern matching
+### 🔄 Pending Work
+1. **VeRL dependency integration** for reinforcement learning
+2. **RLVR training component** implementation
+3. **Multi-problem batch processing**
+4. **Performance optimization** for larger datasets
+### 🎯 Test Results
+- **Problem**: Mbpp/478 (remove lowercase substrings)
+- **IPO Triples**: 10 successfully extracted
+- **Tasks Generated**: 7 reasoning tasks (induction/deduction/abduction)
+- **Evaluation**: Execution-based with proper accuracy scoring
+- **Pipeline Status**: ✅ **FULLY FUNCTIONAL**
+## 📖 Usage Guide
+### Running the Pipeline
+1. Set GPU environment: `export CUDA_VISIBLE_DEVICES=6`
+2. Execute: `bash run_testtime_gpu6.sh`
+3. Check results in: `/tmp/{benchmark}/{problem_id}/`
+### Key Configuration Files
+- `test_complete_pipeline.py`: Main execution script
+- `complete_pipeline.py`: Core pipeline logic
+- `run_testtime_gpu6.sh`: Execution script with GPU settings
+### Debugging
+- Use `--verbose` flag for detailed logging
+- Check individual result files in output directory
+- Monitor GPU memory usage during execution
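+This implementation represents a fully functional TestTime RLVR data-generation and evaluation pipeline based on AZR methodology, successfully integrating all major components for test-time reasoning. The reinforcement learning training step (VeRL/RLVR integration) is still pending, as listed under "Pending Work".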

Update/Phase5_Critical_Bug_Fixes_and_EvalPlus_Integration.md ADDED Viewed

	@@ -0,0 +1,226 @@

+# Phase 5: Critical Bug Fixes and EvalPlus Integration
+## 🎯 Overview
+Critical bug fixes and comprehensive system improvements made during an intensive testing session (July 23, 2025). This phase resolved fundamental issues that were preventing proper IPO extraction, task generation, and evaluation pipeline execution.
+## 🚨 Critical Issues Discovered and Resolved
+### 1. Initial Solution Accuracy 0% Problem ✅ RESOLVED
+**Problem**: All MBPP+ evaluations showing 0% accuracy
+**Root Cause**: MBPP+ data format mismatch - functions expected tuples but received lists
+**Example**: `Mbpp/106` expected `([5,6,7], (9,10))` but got `[[5,6,7], [9,10]]`
+**Solution**: Integrated EvalPlus standard data loading
+```python
+def load_benchmark_problems(benchmark_config: BenchmarkConfig) -> List[str]:
+    if benchmark_config.name == 'mbpp':
+        try:
+            from evalplus.data.mbpp import get_mbpp_plus
+            mbpp_problems = get_mbpp_plus()  # 자동으로 mbpp_deserialize_inputs 적용됨
+            problems = list(mbpp_problems.keys())
+            print(f"✅ MBPP+ 데이터 로드 성공: {len(problems)}개 문제 (EvalPlus 표준 방식)")
+        except Exception as e:
+            print(f"❌ MBPP+ EvalPlus 로딩 실패, 기존 방식 사용: {e}")
+```
+### 2. IPO Extraction Complete Failure ✅ RESOLVED
+**Problem**: "Failed to extract function info from solution" for 56/378 problems (14.8% failure rate)
+**Root Cause**: IPO extractor received raw LLM response text instead of clean function code
+**Solution**: Modified complete pipeline to pass extracted function code
+```python
+# 🔧 수정: raw LLM response 대신 추출된 함수 코드 사용
+extracted_function_code = self.solution_generator._extract_function_code(llm_solution)
+self.logger.log_info(f"📝 Extracted function code for IPO: {extracted_function_code[:100]}...")
+ipo_triples = self.ipo_extractor.extract_triples(problem, extracted_function_code)
+```
+### 3. Task Generation Prompt Contamination ✅ RESOLVED
+**Problem**: LLM-generated solutions contained test cases and assert statements, which were passed along to the reasoning tasks
+**Impact**: Provided answers as hints, essentially cheating
+**Example**: `assert similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) == {4, 5}` in task prompts
+**Solution**: Implemented clean function code extraction
+```python
+def _extract_clean_function_code(self, program_with_tests: str) -> str:
+    """🔧 수정: 프로그램에서 test case와 assert문을 제거하고 순수한 함수 코드만 추출"""
+    clean_code = self.solution_generator._extract_function_code(program_with_tests)
+    return clean_code
+```
+### 4. Anti-Cheating Mechanism Implementation ✅ RESOLVED
+**Problem**: Using all `base_input` test cases for IPO generation was an unfair advantage
+**Solution**: Extract only single prompt example to prevent cheating
+```python
+def _extract_single_prompt_example(self, problem: Dict[str, Any]) -> Optional[Tuple[str, str]]:
+    """🔧 새로운 메서드: 프롬프트의 단일 예시만 추출 (치팅 방지)"""
+    try:
+        # base_input의 첫 번째 항목을 단일 예시로 사용
+        if 'base_input' in problem and problem['base_input']:
+            first_input = problem['base_input'][0]
+            entry_point = problem['entry_point']
+            # Canonical solution으로 정답 계산
+            canonical_code = problem.get('canonical_solution', '')
+            if canonical_code:
+                actual_output = self._execute_llm_solution(canonical_code, entry_point, first_input)
+                return (input_str, str(actual_output))
+```
+### 5. Task Evaluation Pipeline Failure ✅ RESOLVED
+**Problem**: Pipeline failed with `'expected_solution'` KeyError after successful IPO extraction
+**Root Cause**: Inconsistent key naming in task generation methods
+**Analysis**:
+- Individual methods used: `'expected_output'`, `'expected_input'` ❌
+- Pipeline expected: `'expected_solution'` uniformly ✅
+**Solution**: Unified key naming across all task types
+```python
+# Deduction task fix
+'expected_solution': triple['actual_output'],  # 🔧 수정: expected_solution으로 통일
+# Abduction task fix
+'expected_solution': triple['input'],  # 🔧 수정: expected_solution으로 통일
+```
+## 📊 System Improvements
+### 1. EvalPlus Integration
+- **MBPP+**: Full integration with `mbpp_deserialize_inputs`
+- **HumanEval+**: Standard EvalPlus data loading
+- **Type Conversion**: Automatic list → tuple conversion for MBPP+
+- **Compatibility**: Maintains backward compatibility with existing code
+### 2. Enhanced Error Handling
+- **Fallback Logic**: Text parsing when AST parsing fails
+- **Input Processing**: Better handling of nested list formats
+- **Function Extraction**: Robust extraction with multiple fallback methods
+- **Debugging**: Comprehensive logging at each step
+### 3. Batch Evaluation System
+**File**: `test/batch_evaluate_testtime.py`
+- **Scalability**: Process entire benchmarks (378 MBPP+, 164 HumanEval+ problems)
+- **Resume Support**: Continue from specific problem ID
+- **Progress Tracking**: Real-time evaluation progress
+- **Result Aggregation**: Comprehensive summary statistics
+### 4. Pipeline Robustness
+- **Step-by-step Validation**: Each pipeline step verified independently
+- **Graceful Failure**: Problems fail individually without stopping batch
+- **Detailed Logging**: Complete audit trail for debugging
+- **Memory Management**: Proper cleanup between problems
+## 🧪 Testing and Validation
+### 1. Systematic Testing Approach
+```bash
+# Individual problem testing
+python batch_evaluate_testtime.py --problem_id "Mbpp/6" --verbose
+# Batch processing with resume
+python batch_evaluate_testtime.py --max_problems 50 --resume
+# Full benchmark evaluation
+bash run_batch_evaluation.sh "Qwen/Qwen2.5-7B" mbpp 0 6
+```
+### 2. Validation Results
+- **IPO Extraction**: Success rate improved from 85.2% → 100%
+- **Task Generation**: All three task types now generated consistently
+- **Evaluation Pipeline**: No more `'expected_solution'` errors
+- **Data Integrity**: Proper type handling for both benchmarks
+### 3. Performance Metrics
+- **MBPP+ Problems**: 378 total, successful processing
+- **HumanEval+ Problems**: 164 total, successful processing
+- **Memory Usage**: Optimized with proper cleanup
+- **Processing Speed**: ~15-30 seconds per problem
+## 📁 File Structure Updates
+### 1. Enhanced Directory Organization
+```
+tmp/batch_results/batch_evaluation_TIMESTAMP/
+├── mbpp/
+│   └── Mbpp_XXX/
+│       ├── initial_solution/           # ✅ LLM solution
+│       ├── ipo_triples/               # ✅ I-P-O triples
+│       ├── task_prompts/              # ✅ Generated tasks
+│       ├── llm_responses/             # ✅ Task responses
+│       └── XXX_summary.json           # ✅ Complete results
+└── humaneval/
+    └── HumanEval_XXX/                 # Same structure
+```
+### 2. Comprehensive Result Files
+- **Problem Summary**: Individual problem results with accuracy metrics
+- **IPO Triples**: JSON format with extraction method tracking
+- **Task Prompts**: Clean prompts without answer contamination
+- **LLM Responses**: Raw model outputs for each reasoning task
+- **Evaluation Summary**: Aggregate statistics across all problems
+## 🔍 Debugging and Analysis Tools
+### 1. Problem-Specific Analysis
+```bash
+# Examine specific failure cases
+ls /tmp/batch_results/latest/mbpp/Mbpp_101/
+cat /tmp/batch_results/latest/mbpp/Mbpp_101/Mbpp_101_summary.json
+```
+### 2. Comprehensive Logging
+- **Pipeline Steps**: Each step logged with success/failure status
+- **Error Tracking**: Detailed error messages with context
+- **Performance Monitoring**: Timing information for optimization
+- **Data Validation**: Input/output validation at each stage
+### 3. Testing Infrastructure
+- **Unit Tests**: Individual component testing capabilities
+- **Integration Tests**: Complete pipeline validation
+- **Regression Tests**: Prevention of fixed bugs reoccurring
+- **Performance Tests**: Memory and speed benchmarking
+## 🎯 Impact and Results
+### 1. System Reliability
+- **Zero Critical Failures**: All major pipeline failures resolved
+- **Consistent Results**: Reproducible evaluation across runs
+- **Scalable Processing**: Handles full benchmark datasets
+- **Maintainable Code**: Clean separation of concerns
+### 2. Evaluation Quality
+- **Fair Assessment**: Anti-cheating mechanisms prevent data leakage
+- **Accurate Metrics**: Proper type handling for correct evaluation
+- **Comprehensive Coverage**: All reasoning task types generated
+- **Transparent Process**: Complete audit trail available
+### 3. Development Productivity
+- **Rapid Debugging**: Clear error messages and logging
+- **Easy Testing**: Simple commands for various test scenarios
+- **Flexible Configuration**: Easy benchmark and model switching
+- **Results Analysis**: Rich output data for performance analysis
+## 🚀 Current System Status
+### ✅ Fully Operational Components
+1. **EvalPlus Integration**: Standard benchmark data loading
+2. **IPO Extraction**: 100% success rate with fallback mechanisms
+3. **Task Generation**: All three reasoning types with clean prompts
+4. **Pipeline Execution**: Robust end-to-end processing
+5. **Batch Processing**: Scalable evaluation of entire benchmarks
+6. **Result Management**: Comprehensive output and analysis tools
+### 🔄 Next Development Phase
+1. **Training Integration**: Connect to VeRL/RLVR training system
+2. **Performance Optimization**: Speed improvements for large-scale runs
+3. **Advanced Analytics**: More sophisticated result analysis tools
+4. **Multi-Model Support**: Easy switching between different LLMs
+---
+**완료 일시**: 2025-07-23
+**상태**: ✅ Critical Issues Resolved
+**테스트**: ✅ Full Pipeline Validation Complete
+**핵심 성과**: 0% → 100% success rate, production-ready evaluation system

Update/unified_ttrlvr_architecture.md ADDED Viewed

	@@ -0,0 +1,646 @@

+# TTRLVR Unified Architecture - 상세 작동 방식
+## 목차
+1. [개요](#1-개요)
+2. [전체 아키텍처](#2-전체-아키텍처)
+3. [실행 흐름](#3-실행-흐름)
+4. [핵심 컴포넌트](#4-핵심-컴포넌트)
+5. [Phase별 상세 동작](#5-phase별-상세-동작)
+6. [동기화 메커니즘](#6-동기화-메커니즘)
+7. [데이터 흐름](#7-데이터-흐름)
+8. [구현 세부사항](#8-구현-세부사항)
+---
+## 1. 개요
+### 1.1 목적
+TTRLVR Unified는 기존 TTRLVR의 분리된 구조를 하나의 통합된 VeRL 세션으로 재구성하여 동기화 문제를 해결하고 성능을 향상시킨 버전입니다.
+### 1.2 핵심 개선사항
+- **단일 vLLM 인스턴스**: 전체 학습 과정에서 하나의 vLLM만 사용
+- **동기화 문제 해결**: dummy_dtensor 사용 가능
+- **성능 향상**: vLLM 재생성 오버헤드 제거로 30-40% 속도 향상
+- **메모리 효율**: 반복적인 할당/해제 없음
+### 1.3 주요 파일
+- `train_ttrlvr_azr_unified.py`: 메인 실행 스크립트
+- `test/trainer/unified_ttrlvr_trainer.py`: 통합 Trainer 클래스
+- `test/configs/ttrlvr_azr_unified_4gpu.yaml`: VeRL 설정 파일
+---
+## 2. 전체 아키텍처
+### 2.1 기존 vs 통합 구조
+#### 기존 TTRLVR (분리형)
+```
+Round 1:
+├── Phase 1-4: RemoteTestTimePipeline (독립 vLLM #1)
+│   └── ray.kill(pipeline)  # vLLM 삭제
+└── Phase 5: VeRL Training (새 vLLM #2)
+    └── trainer.init_workers()  # 매 라운드마다
+Round 2: (새로운 vLLM 인스턴스들...)
+```
+#### Unified TTRLVR (통합형)
+```
+초기화:
+└── trainer.init_workers()  # 1번만!
+Round 1-N:
+├── Phase 1-4: 데이터 생성 (같은 vLLM)
+└── Phase 5: PPO 학습 (같은 vLLM)
+```
+### 2.2 컴포넌트 관계도
+```
+train_ttrlvr_azr_unified.py
+    │
+    ├── 환경 설정 & 인자 파싱
+    │
+    ├── VeRL generate_main() 호출
+    │   │
+    │   └── UnifiedTTRLVRTrainer 생성
+    │       │
+    │       ├── CompleteTestTimePipeline (Phase 1-4)
+    │       │   ├── 벤치마크 문제 로딩
+    │       │   ├── 프로그램 생성 (diverse_programs)
+    │       │   ├── IPO 추출 (IPOTripleExtractor)
+    │       │   ├── Task 생성 (TestTimeTaskGenerator)
+    │       │   └── 검증 및 필터링
+    │       │
+    │       └── VeRL PPO Training (Phase 5)
+    │           ├── 데이터 형식 변환
+    │           ├── Response 생성
+    │           ├── Reward 계산
+    │           └── Policy 업데이트
+```
+---
+## 3. 실행 흐름
+### 3.1 스크립트 실행
+```bash
+python train_ttrlvr_azr_unified.py --benchmark mbpp --problems 10 --rounds 30 --gpu 0,1,2,3
+```
+### 3.2 초기화 단계
+#### Step 1: 인자 파싱
+```python
+def main():
+    # 명령행 인자 파싱
+    args = parse_arguments()
+    # 환경 설정 (GPU, 경로 등)
+    setup_environment(args.gpu)
+```
+#### Step 2: 문제 리스트 생성
+```python
+# 벤치마크에서 문제 ID 추출
+problem_ids = create_problem_list(args.benchmark, args.problems, args.problem_id)
+# 예: ['Mbpp/1', 'Mbpp/2', 'Mbpp/3', ...]
+```
+#### Step 3: 환경 변수 설정
+```python
+# VeRL이 UnifiedTTRLVRTrainer에 전달할 설정
+os.environ['TTRLVR_PROBLEM_IDS'] = json.dumps(problem_ids)
+os.environ['TTRLVR_TOTAL_ROUNDS'] = str(args.rounds)
+os.environ['TTRLVR_OUTPUT_DIR'] = output_dir
+os.environ['TTRLVR_CONFIG'] = json.dumps(ttrlvr_config)
+```
+#### Step 4: VeRL 실행
+```python
+# VeRL의 main_generation 호출
+verl_args = [
+    'train_ttrlvr_azr_unified.py',
+    f'--config-path={config_path}',
+    '--config-name=ttrlvr_azr_unified_4gpu',
+    f'trainer.project_name=ttrlvr_unified_{args.benchmark}',
+    f'trainer.total_epochs={args.rounds}',  # 각 라운드를 epoch로 매핑
+]
+sys.argv = verl_args
+generate_main()  # VeRL 메인 함수 실행
+```
+### 3.3 VeRL 초기화
+VeRL의 `generate_main()`이 실행되면:
+1. **Config 로딩**: `ttrlvr_azr_unified_4gpu.yaml` 파싱
+2. **Ray 클러스터 초기화**: 분산 처리 환경 설정
+3. **UnifiedTTRLVRTrainer 생성**: 설정에 명시된 클래스 로드
+4. **Worker 초기화**: `trainer.init_workers()` 호출 (1번만!)
+---
+## 4. 핵심 컴포넌트
+### 4.1 UnifiedTTRLVRTrainer
+```python
+class UnifiedTTRLVRTrainer(ReasonRLRayPPOTrainer):
+    """
+    TTRLVR의 모든 Phase를 하나의 VeRL 세션에서 처리하는 통합 Trainer
+    """
+    def __init__(self, ttrlvr_config, problem_ids, total_rounds, ...):
+        super().__init__(...)
+        # TTRLVR 특화 설정
+        self.ttrlvr_config = ttrlvr_config
+        self.problem_ids = problem_ids
+        self.total_rounds = total_rounds
+        self.current_round = 0
+        # CompleteTestTimePipeline 초기화 (나중에)
+        self.ttrlvr_pipeline = None
+```
+### 4.2 CompleteTestTimePipeline 통합
+```python
+def _init_ttrlvr_pipeline(self):
+    """CompleteTestTimePipeline을 VeRL의 vLLM으로 초기화"""
+    # VeRL의 모델 사용
+    self.ttrlvr_pipeline = CompleteTestTimePipeline(
+        model=None,  # VeRL wrapper 통해 접근
+        tokenizer=self.tokenizer,
+        config=self.testtime_config,
+        logger=self.ttrlvr_logger
+    )
+    # VeRL의 vLLM을 사용하도록 설정
+    self.ttrlvr_pipeline.generate_with_verl = self._generate_with_vllm
+```
+---
+## 5. Phase별 상세 동작
+### 5.1 fit() 메서드 - 메인 학습 루프
+```python
+def fit(self):
+    """전체 학습 루프 관리"""
+    # 로거 초기화
+    logger = ReasonRLTracking(...)
+    # 체크포인트 로드 (있으면)
+    self._load_checkpoint()
+    # 라운드별 반복
+    for round_num in range(1, self.total_rounds + 1):
+        self.current_round = round_num
+        # ====== Phase 1-4: 데이터 생성 ======
+        round_data = self._generate_round_data()
+        # ====== Phase 5: PPO 학습 ======
+        metrics = self._train_one_round(round_data, logger)
+        # 체크포인트 저장 (5라운드마다)
+        if round_num % 5 == 0:
+            self._save_checkpoint()
+```
+### 5.2 Phase 1-4: 데이터 생성
+#### 5.2.1 _generate_round_data() 구조
+```python
+def _generate_round_data(self) -> List[Dict[str, Any]]:
+    """Phase 1-4 실행"""
+    # Pipeline 초기화 (처음만)
+    if self.ttrlvr_pipeline is None:
+        self._init_ttrlvr_pipeline()
+    all_tasks = []
+    for problem_id in self.problem_ids:
+        # CompleteTestTimePipeline 실행
+        result = self.ttrlvr_pipeline.run_complete_pipeline(
+            benchmark_config=benchmark_config,
+            problem_id=problem_id,
+            round_num=self.current_round,
+            session_timestamp=session_timestamp
+        )
+        if result['success']:
+            tasks = result['final_tasks']
+            all_tasks.extend(tasks)
+    return all_tasks
+```
+#### 5.2.2 CompleteTestTimePipeline 내부 동작
+**Phase 1: 다양한 프로그램 생성**
+```python
+# 1. 벤치마크 문제 로드
+problem = benchmark_loader.load_problem(benchmark_config, problem_id)
+# 2. Baseline 평가
+baseline_results = self._evaluate_baseline_performance(problem)
+# 3. 다양한 프로그램 생성
+diverse_programs = self._generate_diverse_programs_and_ipo(problem)
+# 내부적으로:
+# - 정교한 프롬프트 템플릿 사용
+# - Temperature 조절로 다양성 확보
+# - 문법 검증
+```
+**Phase 2: I/O 쌍 추출**
+```python
+# IPOTripleExtractor 사용
+ipo_extractor = IPOTripleExtractor(config, logger, model, tokenizer)
+for program in diverse_programs:
+    # 입력 생성
+    inputs = ipo_extractor.generate_inputs(program)
+    # 출력 계산
+    for input in inputs:
+        output = executor.execute(program, input)
+        ipo_buffer.add_triple(input, program, output)
+```
+**Phase 3: Task 생성**
+```python
+# TestTimeTaskGenerator 사용
+task_generator = TestTimeTaskGenerator(config, logger)
+# Induction: I/O → Program
+induction_tasks = task_generator.create_induction_tasks(ipo_triples)
+# Deduction: Program + Input → Output
+deduction_tasks = task_generator.create_deduction_tasks(ipo_triples)
+# Abduction: Program + Output → Input
+abduction_tasks = task_generator.create_abduction_tasks(ipo_triples)
+```
+**Phase 4: 검증 및 필터링**
+```python
+# 각 task 검증
+valid_tasks = []
+for task in all_tasks:
+    if validator.is_valid(task):
+        valid_tasks.append(task)
+```
+### 5.3 Phase 5: PPO 학습
+#### 5.3.1 _train_one_round() 구조
+```python
+def _train_one_round(self, round_data: List[Dict], logger) -> Dict[str, float]:
+    """Phase 5: PPO 학습"""
+    # 1. 데이터 변환
+    train_dataset = self._convert_to_verl_dataset(round_data)
+    # 2. DataLoader 생성
+    self.train_dataloader = self._create_dataloader(
+        train_dataset,
+        batch_size=self.config.data.train_batch_size
+    )
+    # 3. 1 epoch 학습
+    epoch_metrics = {}
+    for step, batch in enumerate(self.train_dataloader):
+        # PPO Step 1: Response 생성
+        gen_batch_output = self.actor_rollout_wg.generate_sequences(batch)
+        # PPO Step 2: Reward 계산
+        reward_tensor = self.reward_fn(batch.union(gen_batch_output))
+        # PPO Step 3: Policy 업데이트
+        update_metrics = self._ppo_update(batch, reward_tensor)
+        # 메트릭 수집
+        for k, v in update_metrics.items():
+            epoch_metrics[k].append(v)
+    return {k: np.mean(v) for k, v in epoch_metrics.items()}
+```
+#### 5.3.2 데이터 변환 과정
+```python
+def _convert_to_verl_dataset(self, round_data: List[Dict]) -> Any:
+    """TTRLVR 형식 → VeRL 형식"""
+    converted_data = []
+    for task in round_data:
+        # 토큰화
+        prompt_ids = self.tokenizer(
+            task['prompt'],
+            max_length=self.config.data.max_prompt_length
+        ).input_ids
+        # VeRL DataProto 형식
+        verl_item = {
+            'input_ids': prompt_ids,
+            'prompt': task['prompt'],
+            'target': task['target'],
+            'task_type': task['task_type'],
+            'problem_id': task['problem_id']
+        }
+        converted_data.append(verl_item)
+    return converted_data
+```
+---
+## 6. 동기화 메커니즘
+### 6.1 문제의 핵심
+기존 TTRLVR은 매 라운드마다 새 vLLM을 생성했기 때문에 dummy_dtensor 사용 시 동기화가 되지 않았습니다.
+### 6.2 해결 방법
+#### 6.2.1 단일 vLLM 인스턴스
+```python
+# 초기화 (1번만)
+trainer.init_workers()
+├── FSDP workers 생성
+├── vLLM workers 생성
+└── 초기 동기화 (sync_model_weights)
+# 이후 모든 라운드에서 같은 인스턴스 사용
+Round 1: Phase 1-4 → Phase 5 (같은 vLLM)
+Round 2: Phase 1-4 → Phase 5 (같은 vLLM)
+...
+```
+#### 6.2.2 동기화 과정
+```python
+# FSDPVLLMShardingManager의 동작
+class FSDPVLLMShardingManager:
+    def __enter__(self):
+        if not self.base_sync_done:
+            # 첫 번째 호출: FSDP → vLLM 동기화
+            sync_model_weights(actor_weights, load_format='dummy_dtensor')
+            self.base_sync_done = True
+        # 이후: 메모리 참조로 자동 동기화
+```
+### 6.3 메모리 참조 메커니즘
+```
+FSDP 모델 (GPU 0-3)          vLLM 모델 (GPU 0-1)
+┌─────────────┐              ┌─────────────┐
+│ Parameter A │ ─────────→   │ Parameter A │ (같은 메모리 참조)
+│ Parameter B │ ─────────→   │ Parameter B │
+│ Parameter C │ ─────────→   │ Parameter C │
+└─────────────┘              └─────────────┘
+PPO 업데이트 → FSDP 파라미터 변경 → vLLM도 자동으로 새 값 사용
+```
+---
+## 7. 데이터 흐름
+### 7.1 Round 1 상세 흐름
+```
+1. Problem: Mbpp/2 (예: "두 수의 합을 구하는 함수 작성")
+   │
+   ├── Phase 1: 프로그램 생성
+   │   ├── Prompt: "Generate 4 different solutions..."
+   │   ├── vLLM 생성 (동기화 발생)
+   │   └── Output: [prog1, prog2, prog3, prog4]
+   │
+   ├── Phase 2: I/O 추출
+   │   ├── 각 프로그램에 대해 입력 생성
+   │   ├── vLLM 사용 (동기화 건너뜀)
+   │   └── Output: [(input1, output1), (input2, output2), ...]
+   │
+   ├── Phase 3: Task 생성
+   │   ├── Induction: (1, 3) → "def add(a,b): return a+b"
+   │   ├── Deduction: (prog, 5) → 8
+   │   └── Abduction: (prog, 10) → (4, 6)
+   │
+   ├── Phase 4: 검증
+   │   └── 유효한 task만 필터링
+   │
+   └── Phase 5: PPO 학습
+       ├── 배치 생성
+       ├── Response 생성 (같은 vLLM)
+       ├── Reward 계산
+       └── FSDP 모델 업데이트
+```
+### 7.2 데이터 형식 변환
+```python
+# TTRLVR Task 형식
+{
+    'problem_id': 'Mbpp/2',
+    'task_type': 'induction',
+    'input': 5,
+    'output': 10,
+    'target': 'def multiply_by_two(x): return x * 2',
+    'prompt': 'Given input 5 produces output 10, write the function:'
+}
+# ↓ 변환
+# VeRL DataProto 형식
+{
+    'input_ids': tensor([1, 234, 567, ...]),  # 토큰화된 prompt
+    'attention_mask': tensor([1, 1, 1, ...]),
+    'prompt': 'Given input 5 produces output 10...',
+    'target': 'def multiply_by_two(x): return x * 2',
+    'meta_info': {
+        'task_type': 'induction',
+        'problem_id': 'Mbpp/2'
+    }
+}
+```
+---
+## 8. 구현 세부사항
+### 8.1 VeRL과의 통합
+#### 8.1.1 _generate_with_vllm 메서드
+```python
+def _generate_with_vllm(self, prompt: str, temperature: float = 0.7):
+    """VeRL의 vLLM을 사용한 텍스트 생성"""
+    # 1. 토큰화
+    input_ids = self.tokenizer(prompt, ...).input_ids
+    # 2. DataProto 생성
+    prompts_proto = DataProto.from_dict({
+        "input_ids": input_ids.cuda(),
+        "attention_mask": torch.ones_like(input_ids).cuda(),
+    })
+    # 3. 메타 정보 설정
+    prompts_proto.meta_info = {
+        "eos_token_id": self.tokenizer.eos_token_id,
+        "temperature": temperature,
+        "do_sample": True,
+        "response_length": 256
+    }
+    # 4. VeRL의 vLLM으로 생성
+    outputs = self.actor_rollout_wg.generate_sequences(prompts_proto)
+    # 5. 디코딩 및 반환
+    return self.tokenizer.decode(outputs.batch["input_ids"][0])
+```
+#### 8.1.2 CompleteTestTimePipeline 수정
+```python
+# CompleteTestTimePipeline이 VeRL의 vLLM을 사용하도록
+self.ttrlvr_pipeline.generate_with_verl = self._generate_with_vllm
+# 이제 Pipeline 내부에서:
+# response = self.generate_with_verl(prompt)  # VeRL의 vLLM 사용
+```
+### 8.2 메모리 관리
+#### 8.2.1 라운드 간 메모리 정리
+```python
+def _manage_memory_between_rounds(self):
+    """라운드 간 메모리 정리 (인스턴스는 유지)"""
+    # GPU 캐시만 정리
+    torch.cuda.empty_cache()
+    # vLLM KV 캐시 정리 (선택적)
+    if hasattr(self.actor_rollout_wg, 'clear_kv_cache'):
+        self.actor_rollout_wg.clear_kv_cache()
+    # Garbage collection
+    import gc
+    gc.collect()
+```
+#### 8.2.2 메모리 모니터링
+```python
+def _monitor_memory(self):
+    """메모리 사용량 모니터링"""
+    for i in range(torch.cuda.device_count()):
+        allocated = torch.cuda.memory_allocated(i) / 1024**3
+        reserved = torch.cuda.memory_reserved(i) / 1024**3
+        print(f"GPU {i}: Allocated={allocated:.2f}GB, Reserved={reserved:.2f}GB")
+```
+### 8.3 에러 처리 및 복구
+```python
+def _safe_generate(self, prompt: str, max_retries: int = 3):
+    """안전한 생성 with 재시도"""
+    for attempt in range(max_retries):
+        try:
+            return self._generate_with_vllm(prompt)
+        except Exception as e:
+            if attempt == max_retries - 1:
+                raise
+            torch.cuda.empty_cache()
+            time.sleep(1)
+```
+### 8.4 체크포인트 관리
+```python
+def _save_checkpoint(self):
+    """체크포인트 저장"""
+    checkpoint = {
+        'round': self.current_round,
+        'model_state_dict': self.actor_rollout_wg.state_dict(),
+        'optimizer_state_dict': self.optimizer.state_dict(),
+        'metrics': self.accumulated_metrics,
+        'timestamp': datetime.now().isoformat()
+    }
+    path = f"{self.checkpoint_dir}/round_{self.current_round}.pt"
+    torch.save(checkpoint, path)
+```
+---
+## 9. 성능 최적화
+### 9.1 배치 처리
+- Phase 1-4에서 가능한 한 배치로 처리
+- vLLM의 continuous batching 활용
+### 9.2 GPU 활용
+- vLLM: GPU 0-1 (tensor parallel)
+- FSDP: GPU 0-3 (data parallel)
+- 효율적인 GPU 메모리 활용
+### 9.3 I/O 최적화
+- Parquet 형식으로 중간 데이터 저장
+- 비동기 I/O 처리
+---
+## 10. 디버깅 및 모니터링
+### 10.1 로깅 구조
+```
+/home/ubuntu/RLVR/TestTime-RLVR-v2/logs/
+├── ttrlvr_unified_20241107_120000.log  # 메인 로그
+├── round_1/
+│   ├── phase_1_4.log  # 데이터 생성 로그
+│   └── phase_5.log    # 학습 로그
+└── metrics/
+    └── tensorboard/   # 학습 메트릭
+```
+### 10.2 주요 모니터링 지표
+- 라운드별 소요 시간
+- 생성된 task 수
+- 평균 reward
+- GPU 메모리 사용량
+- 동기화 발생 횟수
+---
+## 11. 문제 해결 가이드
+### 11.1 OOM (Out of Memory)
+- `gpu_memory_utilization` 조정 (기본: 0.35)
+- `max_num_seqs` 감소
+- 배치 크기 감소
+### 11.2 동기화 문제
+- `load_format`이 `dummy_dtensor`인지 확인
+- vLLM 인스턴스가 재생성되지 않는지 확인
+### 11.3 느린 성능
+- GPU 활용률 확인
+- 배치 크기 증가
+- `enforce_eager=False` 확인 (CUDA graph 사용)
+---
+## 12. 결론
+TTRLVR Unified는 기존 TTRLVR의 모든 기능을 유지하면서 다음을 달성했습니다:
+1. **구조적 개선**: 분리된 Phase들을 하나의 세션으로 통합
+2. **성능 향상**: vLLM 재생성 오버헤드 제거로 30-40% 속도 향상
+3. **안정성 향상**: 동기화 문제 완전 해결
+4. **확장성**: 더 큰 모델과 더 많은 라운드 지원 가능
+이 아키텍처는 TTRLVR의 정교한 데이터 생성 능력과 VeRL의 효율적인 PPO 학습을 완벽하게 결합했습니다.

absolute_zero_reasoner/__init__.py ADDED Viewed

File without changes

absolute_zero_reasoner/configs/azr_ppo_trainer.yaml ADDED Viewed

	@@ -0,0 +1,605 @@

+data:
+  tokenizer: null
+  train_files: data/math/train_${reward_fn.extraction_type}.parquet
+  val_files: data/math/test_${reward_fn.extraction_type}.parquet
+  # Whether to use shared memory for data loading.
+  use_shm: False
+  prompt_key: prompt
+  max_prompt_length: 8096
+  max_response_length: 8096
+  train_batch_size: 1024
+  val_batch_size: 1312
+  return_raw_input_ids: False  # This should be set to true when the tokenizer between policy and rm differs
+  return_raw_chat: False
+  shuffle: True
+  filter_overlong_prompts: False # for large-scale datasets, filtering overlong prompts can be time-consuming. You can set filter_overlong_prompts_workers to use multiprocessing to speed it up.
+  filter_overlong_prompts_workers: 1
+  truncation: error
+  image_key: images
+  video_key: videos
+  custom_cls:
+      path: null
+      name: null
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    pretrained_tokenizer: True
+    use_shm: false
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    use_liger: False
+    use_fused_kernels: False
+    trust_remote_code: True
+  actor:
+    strategy: fsdp2  # This is for backward-compatibility
+    ppo_mini_batch_size: 256
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: null
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    clip_ratio_low: 0.2
+    clip_ratio_high: 0.28
+    clip_ratio_c: 3.0 # lower bound of the value for Dual-clip PPO from https://arxiv.org/pdf/1912.09729
+    entropy_coeff: 0.0
+    use_kl_loss: False # True for GRPO
+    kl_loss_coef: 0.0 # for grpo
+    use_torch_compile: True
+    kl_loss_type: low_var_kl # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
+    loss_agg_mode: "token-mean"
+    entropy_from_logits_with_chunking: False
+    entropy_checkpointing: False
+    # policy loss config
+    policy_loss:
+      # Loss function mode: vanilla / clip-cov / kl-cov from https://arxiv.org/abs/2505.22617
+      loss_mode: "vanilla"
+      # Ratio of tokens to be clipped for clip-cov loss
+      clip_cov_ratio: 0.0002
+      # Lower bound for clip-cov loss
+      clip_cov_lb: 1.0
+      # Upper bound for clip-cov loss
+      clip_cov_ub: 5.0
+      # Ratio of tokens to be applied kl penalty for kl-cov loss
+      kl_cov_ratio: 0.0002
+      # KL divergence penalty coefficient
+      ppo_kl_coef: 0.1
+    checkpoint:
+      # What to include in saved checkpoints
+      # with 'hf_model' you can save the whole model in HF format; currently only the sharded model checkpoint is saved, to save space
+      save_contents: ['model', 'optimizer', 'extra']
+      # For more flexibility, you can specify the contents to load from the checkpoint.
+      load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents}
+    optim:
+      lr: 1e-6
+      lr_warmup_steps: -1 # Prioritized. Negative values mean delegating to lr_warmup_steps_ratio.
+      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
+      min_lr_ratio: 0.0   # only used with cosine lr scheduler, default to 0.0
+      num_cycles: 0.5     # only used with cosine lr scheduler, default to 0.5
+      warmup_style: constant  # select from constant/cosine
+      total_training_steps: -1  # must be overridden by the program
+      weight_decay: 0.0
+    fsdp_config:
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      param_offload: False
+      optimizer_offload: False
+      offload_policy: False # only for fsdp2, offload param/grad/optimizer during training
+      reshard_after_forward: True # only for fsdp2, [True, False, int between 1 and fsdp_size]
+      fsdp_size: -1
+      # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
+      # before the current forward computation.
+      forward_prefetch: False
+    # profiler configs
+    profiler:
+      # True for each task has its own database, False for all tasks in one training step share one database.
+      discrete: False
+      # Whether to profile all ranks.
+      all_ranks: False
+      # The ranks that will be profiled. null or [0,1,...]
+      ranks: null
+  ref:
+    # actor_rollout_ref.ref: FSDP config same as actor. For models larger than 7B, it’s recommended to turn on offload for ref by default
+    strategy: ${actor_rollout_ref.actor.strategy}
+    include_ref: False
+    fsdp_config:
+      param_offload: False
+      reshard_after_forward: True # only for fsdp2, [True, False, int between 1 and fsdp_size]
+      # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
+      # before the current forward computation.
+      forward_prefetch: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+    use_torch_compile: ${actor_rollout_ref.actor.use_torch_compile}
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: null
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
+    # calculate entropy with chunking to reduce memory peak
+    entropy_from_logits_with_chunking: False
+    # recompute entropy
+    entropy_checkpointing: False
+    # profiler configs
+    profiler:
+      # True for each task has its own database, False for all tasks in one training step share one database.
+      discrete: False
+      # Whether to profile all ranks.
+      all_ranks: False
+      # The ranks that will be profiled. null or [0,1,...]
+      ranks: null
+  rollout:
+    name: vllm
+    mode: sync # sync: LLM, async: AsyncLLM
+    chat_scheduler: null
+    max_model_len: null
+    temperature: 1.0
+    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    use_fire_sampling: False
+    prompt_length: ${data.max_prompt_length}  # not used in the open-source version
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16 # should align with FSDP
+    gpu_memory_utilization: 0.5
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_dtensor
+    # for huge model, layered summon can save memory (prevent OOM) but make it slower
+    layered_summon: False
+    tensor_model_parallel_size: 2
+    max_num_batched_tokens: 8192
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: null
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True # could get higher throughput
+    # for hf rollout
+    do_sample: True
+    n: 1 # > 1 for grpo
+    multi_stage_wake_up: false
+    # Extra inference engine arguments (vllm, sglang).
+    engine_kwargs:
+      # for vllm
+      vllm:
+        # Swap space (in GB) used by inference engine. null uses default (e.g., 4 GB).
+        swap_space: null
+        # Whether to disable the preprocessor cache for multimodal models.
+        disable_mm_preprocessor_cache: False
+      # for sglang
+      sglang:
+        # The attention backend for sglang engine. Options: flashinfer, triton, flashmla, null for default.
+        attention_backend: null
+    val_kwargs:
+      # sampling parameters for validation
+      top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+      top_p: 1.0
+      temperature: 0
+      n: 1
+      do_sample: False # greedy decoding by default for validation
+    # number of responses (i.e. num sample times)
+    multi_turn:
+      enable: False  # should set rollout.name to sglang_async if True
+      max_turns: null  # null for no limit (default max_length // 3)
+      tool_config_path: null  # null for no tool
+      format: chatml  # chatml, more formats will be supported in the future
+    # support logging rollout prob for debugging purpose
+    calculate_log_probs: False
+    # profiler configs
+    profiler:
+      # True for each task has its own database, False for all tasks in one training step share one database.
+      discrete: False
+      # Whether to profile all ranks.
+      all_ranks: False
+      # The ranks that will be profiled. null or [0,1,...]
+      ranks: null
+    # [Experimental] agent loop based rollout configs
+    agent:
+      # Number of agent loop workers
+      num_workers: 8
+critic:
+  # Number of rollouts per update (mirrors actor rollout_n)
+  rollout_n: ${actor_rollout_ref.rollout.n}
+  # fsdp or fsdp2 strategy used for critic model training
+  strategy: ${actor_rollout_ref.actor.strategy}
+  optim:
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
+    min_lr_ratio: null   # only useful for warmup with cosine
+    warmup_style: constant  # select from constant/cosine
+    total_training_steps: -1  # must be overridden by the program
+    weight_decay: 0.01
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    use_shm: False
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config: { }
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    fsdp_config:
+      param_offload: False
+      grad_offload: False
+      optimizer_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      # Only for FSDP2: offload param/grad/optimizer during train
+      offload_policy: False
+      # Only for FSDP2: Reshard after forward pass to reduce memory footprint
+      reshard_after_forward: True
+      # Number of GPUs in each FSDP shard group; -1 means auto
+      fsdp_size: -1
+      # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
+      # before the current forward computation.
+      forward_prefetch: False
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1 # sp size
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  grad_clip: 1.0
+  cliprange_value: 0.5
+reward_model:
+  enable: False
+  strategy: fsdp
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}  # set this to null if the chat template is identical
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
+    fsdp_config:
+      min_num_params: 0
+      param_offload: False
+      fsdp_size: -1
+  micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
+  micro_batch_size_per_gpu: null # set a number
+  max_length: null
+  ulysses_sequence_parallel_size: 1 # sp size
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+  # Cloud/local sandbox fusion configuration for custom reward logic
+  sandbox_fusion:
+    # Cloud/local function URL for sandbox execution
+    url: null
+    # Max concurrent requests allowed to sandbox
+    max_concurrent: 64
+    # Max memory limit for each sandbox process in MB
+    memory_limit_mb: 1024
+  # profiler configs
+  profiler:
+    # True for each task has its own database, False for all tasks in one training step share one database.
+    discrete: False
+    # Whether to profile all ranks.
+    all_ranks: False
+    # The ranks that will be profiled. null or [0,1,...]
+    ranks: null
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: gae
+  norm_adv_by_std_in_grpo: True
+  use_kl_in_reward: False
+  kl_penalty: kl  # how to estimate kl divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.0
+    horizon: 10000
+    target_kl: 0.0
+  # Whether to enable preference feedback PPO
+  use_pf_ppo: False
+  # Preference feedback PPO settings
+  pf_ppo:
+    # Method for reweighting samples: "pow", "max_min", or "max_random"
+    reweight_method: pow
+    # Power used for weight scaling in "pow" method
+    weight_pow: 2.0
+ray_init:
+  num_cpus: null # `None` means using all CPUs, which might cause a hang when CPUs are limited, e.g. on SLURM systems. In that case, set it to an allowed number.
+trainer:
+  balance_batch: True
+  debug: False
+  debug_port: 5678
+  wandb_run_id: null
+  total_epochs: 30
+  # The steps that will be profiled. null means no profiling. null or [1,2,5,...]
+  profile_steps: null
+  total_training_steps: null
+  # controller Nvidia Nsight Systems Options. Must set when profile_steps is not None.
+  ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html
+  ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html
+  controller_nsight_options:
+    # Select the API(s) to be traced.
+    trace: "cuda,nvtx,cublas,ucx"
+    # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
+    cuda-memory-usage: "true"
+    # CUDA graphs will be traced as a whole
+    cuda-graph-trace: "graph"
+  # worker Nvidia Nsight Systems Options. Must set when profile_steps is not None.
+  worker_nsight_options:
+    # Select the API(s) to be traced.
+    trace: "cuda,nvtx,cublas,ucx"
+    # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
+    cuda-memory-usage: "true"
+    # CUDA graphs will be traced as a whole
+    cuda-graph-trace: "graph"
+    # Profiling only in a range of torch.cuda.profiler.start and stop. Do not change this config.
+    capture-range: "cudaProfilerApi"
+    # Specify the desired behavior when a capture range ends.
+    # In verl we need the torch.cuda.profiler.start/stop pair to repeat n times.
+    # valid values are "repeat-shutdown:n" or null.
+    # For normal whole step profiling, n = len(profile_steps);
+    # but for discrete profiling, n = len(profile_steps) * Number(subtasks).
+    # Or you can just leave it null and the program will use n = len(profile_steps) * 6;
+    capture-range-end: null
+    # Send signal to the target application's process group. We let the program to exit by itself.
+    kill: none
+  project_name: verl_examples
+  experiment_name: gsm8k
+  logger: [ 'console', 'wandb' ]
+  # Number of generations to log during validation
+  log_val_generations: 0
+  # Directory for logging rollout data; no dump if null
+  rollout_data_dir: null
+  # Directory for logging validation data; no dump if null
+  validation_data_dir: null
+  # Number of nodes used in the training
+  nnodes: 1
+  n_gpus_per_node: 8
+  save_freq: -1
+  # auto: find the last ckpt to resume. If can't find, start from scratch
+  resume_mode: auto # or disable, or resume_path if resume_from_path is set
+  resume_from_path: False
+  # ESI redundant time (in seconds) for model checkpoints
+  esi_redundant_time: 0
+  test_freq: -1
+  critic_warmup: 0
+  default_hdfs_dir: null
+  default_local_dir: checkpoints/code_io/${trainer.project_name}/${trainer.experiment_name}
+  remove_previous_ckpt_in_save: False
+  del_local_ckpt_after_load: False
+  wandb_tags: null
+  # Maximum number of actor checkpoints to keep
+  max_actor_ckpt_to_keep: null
+  # Maximum number of critic checkpoints to keep
+  max_critic_ckpt_to_keep: null
+  # Timeout (in seconds) for Ray worker to wait for registration
+  ray_wait_register_center_timeout: 300
+  # Device to run training on (e.g., "cuda", "cpu")
+  device: cuda
+reward_fn:
+  extraction_type: answer_addition
+  math_metric: deepscaler #[math_verify|deepscaler|union]
+  splitter: "Assistant:"
+  boxed_retry: False
+azr:
+  seed: 1
+  executor_max_workers: 1
+  executor_cleanup_frequency: 1
+  problem_types:
+    - code_i
+    - code_o
+    - code_f
+  pred_data_mix_strategy: "max_new"  # [uniform_total, max_new, half_new, step]
+  gen_data_probabilities_strategy: "uniform"  # [uniform, step]
+  past_epoch_window: ${azr.data_selection_strategy.update_iteration}
+  seed_dataset: null
+  error_seed_dataset: null
+  output_seed_path: null
+  output_error_seed_path: null
+  output_code_f_seed_path: null
+  code_f_seed_dataset: null
+  pretrain_pred_steps: -1
+  executor: qwq # [qwq, sandboxfusion]
+  ast_check: True
+  execute_max_timeout: 10 # seconds
+  random_print_max_programs: 3
+  train_propose: True
+  use_china_mirror: True # used for sandboxfusion executor for people in China
+  # Data saving options
+  save_generated_data: True   # Enable/disable saving generated data
+  save_data_path: "./generated_programs"  # Path to save generated data (if null, don't save)
+  save_valid_data: True       # Save valid programs
+  save_invalid_data: True     # Save invalid programs
+  save_frequency: 1           # Save every N steps (1 = every step)
+  save_final_datasets: False  # Save complete datasets at training end
+  data_selection_strategy:
+    io_n: 6
+    update_iteration: 1
+    data_len: null # dummy set
+    seed_batch_factor: 4
+    content_max_length: 8096
+    valid_program_filter: all # [all (all valids), non_one (all valids except 100% accuracy), non_extremes (all valids except 0% and 100% accuracy)]
+    max_programs: null
+    batched_estimate: False
+    composite_function_n_min: -1
+    composite_function_n_max: -1
+    composite_chance: 0.5
+    composite_start_step: -1
+    max_programs_initial: ${azr.data_selection_strategy.composite_function_n_max}
+    composite_chance_initial: ${azr.data_selection_strategy.composite_chance}
+    composite_scheduler:
+      enabled: False
+      update_num_programs_start: 101
+      update_num_programs_interval: 50
+      num_programs_max: 3
+      update_probability_start: 101
+      update_probability_interval: 50
+      update_probability_max: 0.8
+      update_probability_increment: 0.01
+    num_inputs: 10 # for code_f, how many inputs to generate
+    banned_words:
+      - logging
+      - random
+      - multiprocessing
+      - pebble
+      - subprocess
+      - threading
+      - datetime
+      - time
+      - hashlib
+      - hmac
+      - bcrypt
+      - os.sys
+      - os.path
+      - sys.exit
+      - os.environ
+      - calendar
+      - datetime
+    banned_keywords_for_errors_and_exceptions:
+      # - raise
+      # - assert
+      # - try
+      # - except
+  reward:
+    n_samples: 8
+    extract_code_block: True
+    code_f_reward_type: binary # [accuracy, binary]
+    generation_reward_config:
+      format_reward: True
+      reject_multiple_functions: True
+      reject_test_input_in_code: False
+      f_replace_location: not_first # [not_first, any_last, any_first, not_last]
+      intrinsic_combine_method: sum # [sum, multiply, sum_multiply]
+      remove_after_return: False # remove global variables
+      remove_comments: False
+      remove_print: False
+      use_original_code_as_ref: False
+      generation_accuracy_convertion: one_minus
+      remove_input_from_snippet: False # prompting
+      include_references: True # ablation for unconditional generation
+      code_location: first # [first, last]
+      complexity_reward:
+        enabled: False
+        coef: 0.0
+        max: 0.5
+      mean_edit_distance_reward:
+        enabled: False
+        coef: 0.0
+        max: 0.5
+      halstead_reward:
+        enabled: False
+        coef: 0.0
+        max: 0.5
+      answer_diversity_reward:
+        enabled: False
+        coef: 0.0
+        max: 0.5
+        hierarchical: False
+      f_input_answer_diversity_reward:
+        enabled: False
+        coef: 0.0
+        max: 0.5
+        hierarchical: False
+      f_output_answer_diversity_reward:
+        enabled: False
+        coef: 0.0
+        max: 0.5
+        hierarchical: False

absolute_zero_reasoner/data_construction/__init__.py ADDED Viewed

File without changes

absolute_zero_reasoner/data_construction/constructor.py ADDED Viewed

	@@ -0,0 +1,225 @@

+from typing import List, Dict
+from numpy import random
+import pandas as pd
+from transformers import AutoTokenizer
+from absolute_zero_reasoner.data_construction.prompts import get_code_problem_generator_prompt, get_code_problem_predictor_prompt
+from absolute_zero_reasoner.data_construction.process_data import boxed_instruction, instruction_following
+from absolute_zero_reasoner.utils.code_utils.parsers import replace_main_function_name
+def get_gen_code_io_data(
+    io_data: List[Dict],
+    target_data_len: int,
+    problem_type: str,
+    instruction_type: str,
+    content_max_length: int,
+    io_n: int,
+    output_path: str,
+    split: str,
+    tokenizer: AutoTokenizer,
+    banned_keywords: List[str],
+    banned_assertion_keywords: List[str],
+    weights: List[float] = None,
+    enable_composite_function: bool = False,
+    composite_function_n_min: int = -1,
+    composite_function_n_max: int = -1,
+    composite_chance: float = 0.5,
+    remove_after_return: bool = False,
+    num_inputs: int = 10,
+    remove_input_from_snippet: bool = False,
+    include_references: bool = True,
+):
+    return_io_data = []
+    if instruction_type.startswith('boxed'):
+        instruction_template = boxed_instruction
+    elif instruction_type.startswith('answer'):
+        instruction_template = instruction_following
+    elif instruction_type.startswith('none'):
+        instruction_template = '{}'
+    else:
+        raise ValueError(f"Invalid instruction type: {instruction_type}")
+    if weights is None:
+        probabilities = [1.0 / len(io_data)] * len(io_data)
+    else:
+        # Normalize weights to form a probability distribution
+        probabilities = [float(w)/sum(weights) for w in weights]
+    idx = 0
+    while len(return_io_data) < target_data_len:
+        if not include_references and problem_type != 'code_f':
+            chosen_references = []
+        else:
+            chosen_references = random.choice(io_data, size=min(io_n, len(io_data)), replace=False, p=probabilities)
+        # composite functions is not used for code_f problem type
+        if problem_type != 'code_f' and composite_function_n_max > 0 and enable_composite_function and random.random() <= composite_chance and len(chosen_references) > composite_function_n_max:
+            # TODO: we only allow composite to sample from code snippets without composite functions
+            io_without_composite_function_indices = [i for i in range(len(io_data)) if not io_data[i]['composite_functions']]
+            io_without_composite_function_data = [io_data[i] for i in io_without_composite_function_indices]
+            io_without_composite_function_weights = [probabilities[i] for i in io_without_composite_function_indices]
+            # normalize the weights
+            io_without_composite_function_probabilities = [w / sum(io_without_composite_function_weights) for w in io_without_composite_function_weights]
+            # number of composite functions to sample is either fixed or random
+            composite_function_n = composite_function_n_min if composite_function_n_min == composite_function_n_max else random.randint(composite_function_n_min, composite_function_n_max)
+            composite_functions = random.choice(io_without_composite_function_data, size=composite_function_n, replace=False, p=io_without_composite_function_probabilities)
+            for i, composite_function in enumerate(composite_functions):
+                # TODO: need to also replace recursively called composite functions, ignore functions that have f as the last letter, only for function call f()
+                composite_functions[i]['snippet'] = replace_main_function_name(composite_function['snippet'], 'f', f'g_{i}')
+            imports = []
+        else:
+            composite_functions = []
+            if include_references:
+                imports = chosen_references[0]['imports']
+            else:
+                imports = []
+        io_prompt = instruction_template.format(
+            get_code_problem_generator_prompt(
+                problem_type=problem_type,
+                reference_snippets=chosen_references,
+                banned_keywords=banned_keywords,
+                banned_assertion_keywords=banned_assertion_keywords,
+                composite_functions=composite_functions,
+                remove_after_return=remove_after_return,
+                num_inputs=num_inputs,
+                remove_input_from_snippet=remove_input_from_snippet,
+            )
+        )
+        if len(tokenizer(io_prompt)['input_ids']) <= content_max_length:
+            io_item = {
+                "data_source": 'gen_' + problem_type,
+                "prompt": [{
+                    "role": "user",
+                    "content": io_prompt,
+                }],
+                "problem": '',
+                "ability": "code",
+                "reward_model": {
+                    "style": "rule",
+                    "ground_truth": '',
+                },
+                "extra_info": {
+                    'split': split,
+                    'index': idx,
+                    'metric': 'gen_' + problem_type,
+                    'chosen_references': chosen_references,
+                    'composite_functions': composite_functions,
+                    'imports': imports,
+                }
+            }
+            return_io_data.append(io_item)
+            idx += 1
+        if len(return_io_data) >= target_data_len:
+            break
+    # if io_data is not full, we sample upsample random data
+    while len(return_io_data) < target_data_len:
+        io_item = io_data[random.randint(0, len(io_data))]
+        return_io_data.append(io_item)
+    # output to parquet
+    df = pd.DataFrame(return_io_data)
+    df.to_parquet(output_path)
+def get_pred_code_io_data(
+    io_data: List[Dict],
+    target_data_len: int,
+    problem_type: str,
+    instruction_type: str,
+    content_max_length: int,
+    output_path: str,
+    split: str,
+    tokenizer: AutoTokenizer,
+):
+    return_io_data = []
+    if instruction_type.startswith('boxed'):
+        instruction_template = boxed_instruction
+    elif instruction_type.startswith('answer'):
+        instruction_template = instruction_following
+    elif instruction_type.startswith('none'):
+        instruction_template = '{}'
+    else:
+        raise ValueError(f"Invalid instruction type: {instruction_type}")
+    for idx, io_item in enumerate(io_data):
+        if problem_type == 'code_i':
+            ground_truth = io_item['input']
+        elif problem_type == 'code_o':
+            ground_truth = io_item['output']
+        elif problem_type == 'code_e':
+            ground_truth = io_item['output']
+        elif problem_type == 'code_f':
+            ground_truth = io_item['snippet']
+        else:
+            raise ValueError(f"Invalid problem type: {problem_type}")
+        if problem_type == 'code_f':
+            num_given_inputs = len(io_item['inputs']) // 2
+            num_given_outputs = len(io_item['outputs']) // 2
+            given_inputs = list(io_item['inputs'][:num_given_inputs])
+            given_outputs = list(io_item['outputs'][:num_given_outputs])
+            hidden_inputs = list(io_item['inputs'][num_given_inputs:])
+            hidden_outputs = list(io_item['outputs'][num_given_outputs:])
+            io_prompt = instruction_template.format(
+                get_code_problem_predictor_prompt(
+                    problem_type=problem_type,
+                    snippet=io_item['snippet'],
+                    message=io_item['message'],
+                    input_output_pairs=zip(given_inputs, given_outputs),
+                )
+            )
+        else:
+            io_prompt = instruction_template.format(
+                get_code_problem_predictor_prompt(
+                    problem_type=problem_type,
+                    snippet=io_item['snippet'],
+                    input_args=io_item['input'],
+                    output=io_item['output'],
+                )
+            )
+        if len(tokenizer(io_prompt)['input_ids']) <= content_max_length:
+            output_io_item = {
+                "data_source": 'pred_' + problem_type,
+                "prompt": [{
+                    "role": "user",
+                    "content": io_prompt,
+                }],
+                "problem": io_item['snippet'],
+                "ability": "code",
+                "reward_model": {
+                    "style": "rule",
+                    "ground_truth": ground_truth,
+                },
+                "extra_info": {
+                    'split': split,
+                    'index': idx,
+                    'metric': 'pred_' + problem_type,
+                    'imports': io_item['imports'],
+                }
+            }
+            if problem_type == 'code_f': # for code_f, we need to split the inputs and outputs into given and hidden, only show part of the inputs and outputs to the model
+                output_io_item['extra_info']['given_inputs'] = given_inputs
+                output_io_item['extra_info']['given_outputs'] = given_outputs
+                output_io_item['extra_info']['hidden_inputs'] = hidden_inputs
+                output_io_item['extra_info']['hidden_outputs'] = hidden_outputs
+                output_io_item['extra_info']['message'] = io_item['message']
+            else:
+                output_io_item['extra_info']['input'] = io_item['input']
+                output_io_item['extra_info']['output'] = io_item['output']
+            return_io_data.append(output_io_item)
+        if len(return_io_data) >= target_data_len:
+            break
+    # if io_data is not full, we sample upsample random data
+    while len(return_io_data) < target_data_len:
+        io_item = return_io_data[random.randint(0, len(return_io_data))]
+        return_io_data.append(io_item)
+    # output to parquet
+    df = pd.DataFrame(return_io_data)
+    df.to_parquet(output_path)

absolute_zero_reasoner/data_construction/process_code_reasoning_data.py ADDED Viewed

	@@ -0,0 +1,175 @@

+from pathlib import Path
+import argparse
+import re
+from datasets import load_dataset
+from tqdm import tqdm
+import pandas as pd
+from absolute_zero_reasoner.rewards.code_reward import format_python_code
+from absolute_zero_reasoner.data_construction.prompts import get_code_problem_predictor_prompt
+from absolute_zero_reasoner.data_construction.process_data import instruction_following
+def process_livecodebench_execution(row):
+    # Extract all function names from the code
+    program_name_matches = re.findall(r'def\s+(\w+)\s*\(', row['problem'])
+    if not program_name_matches:
+        raise ValueError("Could not find any function names in code")
+    # Extract the function name from the input
+    input_match = re.search(r'(\w+)\(', row['input'])
+    if not input_match:
+        raise ValueError("Could not find function name in input")
+    input_function_name = input_match.group(1)
+    # Check if the function name from input appears in any of the defined functions
+    if input_function_name not in program_name_matches:
+        raise ValueError(f"Function '{input_function_name}' from input not found in code. Available functions: {program_name_matches}")
+    # Use the function name from input for replacement
+    program_name = input_function_name
+    # Replace the program name with `f` in the code
+    row['problem'] = re.sub(r'def\s+' + re.escape(program_name) + r'\s*\(', 'def f(', row['problem'])
+    # Process the input: remove the function name and keep only the parameters
+    row['input'] = re.sub(r'^\w+\s*\(|\)$', '', row['input']).strip()
+    return row
+def add_imports(problem):
+    # Add necessary imports based on the content of the problem
+    if 'collections' in problem:
+        problem = 'import collections\n' + problem
+    if 'Counter' in problem:
+        problem = 'from collections import Counter\n' + problem
+    if 'gcd' in problem:
+        problem = 'from math import gcd\n' + problem
+    if 'deque' in problem:
+        problem = 'from collections import deque\n' + problem
+    if '@cache' in problem:
+        problem = 'from functools import cache\n' + problem
+    if '= inf' in problem or '[inf]' in problem or 'inf)' in problem:
+        problem = 'from math import inf\n' + problem
+    if 'accumulate' in problem:
+        problem = 'from itertools import accumulate\n' + problem
+    if '@lru_cache' in problem:
+        problem = 'from functools import lru_cache\n' + problem
+    if 'defaultdict' in problem:
+        problem = 'from collections import defaultdict\n' + problem
+    if 'bisect' in problem:
+        problem = 'import bisect\n' + problem
+    if 'islice' in problem:
+        problem = 'from itertools import islice\n' + problem
+    if 'math.inf' in problem:
+        problem = 'import math\n' + problem
+    if 'prod(' in problem:
+        problem = 'from math import prod\n' + problem
+    if 'heapify(' in problem:
+        problem = 'from heapq import heapify, heappop, heappush\n' + problem
+    if 'reduce(' in problem:
+        problem = 'from functools import reduce\n' + problem
+    if 'comb(' in problem:
+        problem = 'from math import comb\n' + problem
+    problem = problem.replace('List', 'list').replace('Dict', 'dict').replace('Tuple', 'tuple').replace('Set', 'set')
+    problem = problem.replace('from typing import list', 'from typing import List')
+    return problem
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--max_length', type=int, default=-1)
+    args = parser.parse_args()
+    # 283, 452, 510
+    ds = load_dataset('cruxeval-org/cruxeval')['test']
+    ds = ds.map(lambda x: {'problem': format_python_code(x['code'])})
+    output_data = []
+    for i, data in enumerate(tqdm(ds, desc="Processing CruxEval")):
+        prompt = get_code_problem_predictor_prompt('code_i', data['problem'], data['input'], data['output'])
+        formatted_question = instruction_following.format(prompt)
+        output_data.append({
+            "data_source": 'cruxeval_i',
+            "prompt": [{
+                "role": "user",
+                "content": formatted_question
+            }],
+            "problem": data['problem'],
+            "ability": "math",
+            "reward_model": {
+                "style": "rule",
+                "ground_truth": data['output']
+            },
+            "extra_info": {
+                'split': 'test',
+                'index': i,
+                'metric': 'pred_code_i',
+                'problem_type': 'code_i',
+                'input': data['input'],
+                'output': data['output'],
+            }
+        })
+        prompt = get_code_problem_predictor_prompt('code_o', data['problem'], data['input'], data['output'])
+        formatted_question = instruction_following.format(prompt)
+        output_data.append({
+            "data_source": 'cruxeval_o',
+            "prompt": [{
+                "role": "user",
+                "content": formatted_question
+            }],
+            "problem": data['problem'],
+            "ability": "math",
+            "reward_model": {
+                "style": "rule",
+                "ground_truth": data['output']
+            },
+            "extra_info": {
+                'split': 'test',
+                'index': i + len(data),
+                'metric': 'pred_code_o',
+                'problem_type': 'code_o',
+                'input': data['input'],
+                'output': data['output'],
+            }
+        })
+    # another ds:
+    ds = load_dataset('livecodebench/execution')['test']
+    ds = ds.map(lambda x: {'problem': format_python_code(x['code'])})
+    ds = ds.remove_columns(['code'])
+    ds = ds.map(process_livecodebench_execution)
+    # normalize the code
+    ds = ds.map(lambda x: {'problem': add_imports(x['problem'])})
+    for i, data in enumerate(tqdm(ds, desc="Processing LiveCodeBench")):
+        prompt = get_code_problem_predictor_prompt('code_i', data['problem'], data['input'], data['output'])
+        formatted_question = instruction_following.format(prompt)
+        output_data.append({
+            "data_source": 'livecodebench',
+            "prompt": [{
+                "role": "user",
+                "content": formatted_question
+            }],
+            "problem": data['problem'],
+            "ability": "math",
+            "reward_model": {
+                "style": "rule",
+                "ground_truth": data['output']
+            },
+            "extra_info": {
+                'split': 'test',
+                'index': i + len(data),
+                'metric': 'pred_code_i',
+                'problem_type': 'code_i',
+                'input': data['input'],
+                'output': data['output'],
+            }
+        })
+    df = pd.DataFrame(output_data)
+    if args.max_length > 0:
+        df = df.iloc[:args.max_length]
+    path = Path('data/code_reason')
+    path.mkdir(parents=True, exist_ok=True)
+    df.to_parquet(path / f'test_answer{"_" + str(args.max_length) if args.max_length > 0 else ""}.parquet')

absolute_zero_reasoner/data_construction/process_data.py ADDED Viewed

	@@ -0,0 +1,210 @@

+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocess math training sets (MATH, ORZ, SimpleRL, Big-Math, DeepScaleR, DAPO) and the math evaluation sets to parquet format
+"""
+import os
+import datasets
+from glob import glob
+import argparse
+from verl.utils.hdfs_io import copy, makedirs
+from verl.utils.reward_score.math import remove_boxed, last_boxed_only_string
+def extract_solution(solution_str):
+    return remove_boxed(last_boxed_only_string(solution_str))
+# Maps a row's data_source name to the value stored in extra_info['metric']
+# by make_map_fn; a data_source missing here raises KeyError there.
+METRIC_MAP = {
+        'aime2024': 'math',
+        'aime2025': 'math',
+        'gpqa': 'mc',
+        'amc2023': 'math',
+        'math500': 'math',
+        'minerva': 'math',
+        'olympiadbench': 'math',
+        'math': 'math',
+        'orz': 'math',
+        'simplerl': 'math',
+        'hmmt_2025': 'math',
+        'hmmt_2024': 'math',
+        'live_math_bench': 'math',
+        'big_math': 'math',
+        'deepscaler': 'math',
+        "math3to5": 'math',
+        'dapo': 'math',
+    }
+# Prompt template for the 'answer' extraction type: the single `{}` is filled
+# with the question via str.format, and the text ends with an opened <think> tag.
+instruction_following = "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. User: {}\nAssistant: <think>"
+# Prompt template for the 'boxed' extraction type: `{}` is the question; the
+# doubled braces render as a literal `\boxed{}` after str.format.
+boxed_instruction = "{}\nPlease reason step by step, and put your final answer within \\boxed{{}}."
+# add a row to each data item that represents a unique id
+def make_map_fn(split, question_key, answer_key, do_extract_solution, reward_fn_extraction_type, nothink = False):
+    def process_fn(example, idx):
+        question = example.pop(question_key)
+        if reward_fn_extraction_type == 'answer':
+            formatted_question = (instruction_following if not nothink else instruction_following.strip(' <think>')).format(question)
+        elif reward_fn_extraction_type == 'boxed':
+            formatted_question = boxed_instruction.format(question)
+        elif reward_fn_extraction_type == 'none':
+            formatted_question = question
+        # gpqa has this string in the question
+        if reward_fn_extraction_type != 'boxed':
+            remove_string = "\n\nPlease reason step-by-step and put your choice letter without any other text with \\boxed{} in the end."
+            replacement_string = '\n\nPlease reason step-by-step and put your choice letter without any other text with <answer> </answer> in the end.'
+            formatted_question = formatted_question.replace(remove_string, replacement_string)
+        answer = example.pop(answer_key)
+        if do_extract_solution:
+            solution = extract_solution(answer)
+        else:
+            solution = answer
+        data_source = example.pop('data_source')
+        data = {
+            "data_source": data_source,
+            "prompt": [{
+                "role": "user",
+                "content": formatted_question
+            }],
+            "problem": question,
+            "ability": "math",
+            "reward_model": {
+                "style": "rule",
+                "ground_truth": solution
+            },
+            "extra_info": {
+                'split': split,
+                'index': idx,
+                'metric': METRIC_MAP[data_source],
+            }
+        }
+        return data
+    return process_fn
+def process_data(args):
+    # 'lighteval/MATH' is no longer available on huggingface.
+    # Use mirror repo: DigitalLearningGmbH/MATH-lighteval
+    if args.train_set == 'math':
+        dataset = datasets.load_dataset('DigitalLearningGmbH/MATH-lighteval', trust_remote_code=True)
+    elif args.train_set == 'orz':
+        dataset = datasets.load_dataset('json', data_files='data/orz_math_57k_collected.json')
+        dataset = dataset.map(lambda x: {'problem': x['0']['value'], 'solution': x['1']['ground_truth']['value']})
+    elif args.train_set == 'simplerl':
+        dataset = datasets.load_dataset('json', data_files='data/math_level3to5_data_processed_with_qwen_prompt.json')
+        dataset = dataset.map(lambda x: {'problem': x['input'].replace('<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n<|im_start|>user\n', '').replace('<|im_end|>\n<|im_start|>assistant', ''), 'solution': x['gt_answer']})
+    elif args.train_set == 'big_math':
+        dataset = datasets.load_dataset('SynthLabsAI/Big-Math-RL-Verified')
+        dataset = dataset.rename_column('answer', 'solution')
+    elif args.train_set == 'deepscaler':
+        dataset = datasets.load_dataset('agentica-org/DeepScaleR-Preview-Dataset')
+        dataset = dataset.remove_columns(['solution'])
+        dataset = dataset.rename_column('answer', 'solution')
+    elif args.train_set == 'dapo':
+        remove_string = "Solve the following math problem step by step. The last line of your response should be of the form Answer: $Answer (without quotes) where $Answer is the answer to the problem.\n\n"
+        remove_string_2 = "\n\nRemember to put your answer on its own line after \"Answer:\"."
+        dataset = datasets.load_dataset('YouJiacheng/DAPO-Math-17k-dedup')
+        dataset = dataset.map(lambda x: {'problem': x['prompt'][0]['content'].replace(remove_string, '').replace(remove_string_2, '').strip(), 'solution': x['reward_model']['ground_truth']})
+    else:
+        raise ValueError(f"Invalid train_set: {args.train_set}")
+    if not args.test_only:
+        train_dataset = dataset['train']
+        train_dataset = train_dataset.add_column('data_source', [args.train_set] * len(train_dataset))
+        if args.filter_key is not None and args.filter_value is not None:
+            train_dataset = train_dataset.filter(lambda x: x[args.filter_key] == args.filter_value)
+        train_dataset = train_dataset.remove_columns([k for k in train_dataset.column_names if k not in ['problem', 'solution', 'data_source']])
+    test_datasources = glob('data/*.jsonl')
+    test_datasets = []
+    for test_datasource in test_datasources:
+        if 'seed_io' in test_datasource or 'MbppPlus' in test_datasource or 'HumanEvalPlus' in test_datasource:
+            continue
+        temp_ds = datasets.load_dataset('json', data_files=test_datasource, split='train')
+        if 'question' in temp_ds.column_names and 'problem' not in temp_ds.column_names:
+            temp_ds = temp_ds.rename_column('question', 'problem')
+        temp_ds = temp_ds.remove_columns([col for col in temp_ds.column_names if col not in ['problem', 'answer']])
+        temp_ds = temp_ds.add_column('data_source', [test_datasource.split('/')[-1].split('.')[0]] * len(temp_ds))
+        temp_ds = temp_ds.cast_column('problem', datasets.Value('string'))
+        temp_ds = temp_ds.cast_column('answer', datasets.Value('string'))
+        temp_ds = temp_ds.cast_column('data_source', datasets.Value('string'))
+        test_datasets.append(temp_ds)
+    live_math_bench_datasets = ['v202412_AMC_en', 'v202412_CCEE_en', 'v202412_CNMO_en', 'v202412_WLPMC_en', 'v202412_hard_en']
+    for dataset_name in live_math_bench_datasets:
+        live_math_bench_ds = datasets.load_dataset('opencompass/LiveMathBench', dataset_name)['test']
+        live_math_bench_ds = live_math_bench_ds.rename_column('question', 'problem')
+        live_math_bench_ds = live_math_bench_ds.remove_columns([col for col in live_math_bench_ds.column_names if col not in ['problem', 'answer']])
+        live_math_bench_ds = live_math_bench_ds.add_column('data_source', ['live_math_bench'] * len(live_math_bench_ds))
+        test_datasets.append(live_math_bench_ds)
+    test_dataset = datasets.concatenate_datasets(test_datasets)
+    if not args.test_only:
+        train_dataset = train_dataset.map(
+            function=make_map_fn(args.train_split_key, 'problem', 'solution', args.train_set == 'math', args.reward_fn_extraction_type, args.nothink),
+            with_indices=True, num_proc=16,
+        )
+    test_dataset = test_dataset.map(
+        function=make_map_fn(args.eval_split_key, 'problem', 'answer', False, args.reward_fn_extraction_type, args.nothink),
+        with_indices=True, num_proc=16,
+    )
+    if args.length_limit != -1 and not args.test_only:
+        train_dataset = train_dataset.select(range(args.length_limit))
+        test_dataset = test_dataset.select(range(args.length_limit))
+    local_dir = args.local_dir + f'/{args.train_set}{"_nothink" if args.nothink else ""}'
+    hdfs_dir = args.hdfs_dir
+    if args.filter_key is not None:
+        filter_key = f"_{args.filter_key}_{args.filter_value}"
+    else:
+        filter_key = ""
+    if not args.test_only:
+        train_dataset.to_parquet(os.path.join(local_dir, f'train_{args.reward_fn_extraction_type}{"" if args.length_limit == -1 else f"_{args.length_limit}"}{filter_key}.parquet'))
+    test_dataset.to_parquet(os.path.join(local_dir, f'test_{args.reward_fn_extraction_type}{"_ood" if args.ood_testsets else ""}{"" if args.length_limit == -1 else f"_{args.length_limit}"}{filter_key}.parquet'))
+    if hdfs_dir is not None:
+        makedirs(hdfs_dir)
+        copy(src=local_dir, dst=hdfs_dir)
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--local_dir', default='data')
+    parser.add_argument(
+        '--reward_fn_extraction_type',
+        default='answer',
+        choices=['answer', 'boxed', 'none']
+    )
+    parser.add_argument('--length_limit', default=-1, type=int)
+    parser.add_argument('--hdfs_dir', default=None)
+    parser.add_argument('--train_set', default='math', choices=['math', 'orz', 'simplerl', 'big_math', 'deepscaler', 'dapo'])
+    parser.add_argument('--test_only', default=False, action='store_true')
+    parser.add_argument('--train_split_key', default='train', type=str)
+    parser.add_argument('--eval_split_key', default='test', type=str)
+    parser.add_argument('--filter_key', default=None, type=str)
+    parser.add_argument('--filter_value', default=None, type=str)
+    parser.add_argument('--nothink', default=False, action='store_true')
+    args = parser.parse_args()
+    print(args)
+    process_data(args)

absolute_zero_reasoner/data_construction/prompts.py ADDED Viewed

	@@ -0,0 +1,546 @@

+from typing import List, Dict, Tuple
+# Prompt template asking the model to write a new snippet plus one input, for a
+# task where a test subject must deduce an input from the function's output.
+# Placeholders: {remove_input_from_snippet_prompt}, {remove_after_return_prompt}
+# and the literal marker <|BANNED_KEYWORDS|>. Literal braces are doubled, so the
+# text is presumably filled with str.format - confirm at the call site.
+# The text ends with the "Reference Code Snippets" header; the snippets are
+# presumably appended by the caller.
+# NOTE(review): the "Difficulty in predicting the output ..." bullet under
+# Evaluation Criteria does not match the deduce-the-input task stated at the
+# top, and looks swapped with the bullet in code_output_prompt. The wording is
+# runtime text, so it is left untouched here - confirm before changing.
+code_input_prompt = """
+## Task: Create a Python Code Snippet (where custom classes are allowed, which should be defined at the top of the code snippet) with one Matching Input
+Using the reference code snippets provided below as examples, design a new and unique Python code snippet that demands deep algorithmic reasoning to deduce one possible input from a given output. Your submission should include both a code snippet and test input pair, where the input will be plugged into the code snippet to produce the output, which that function output be given to a test subject to come up with any input that will produce the same function output. This is meant to be an I.Q. test.
+### Code Requirements:
+- Name the entry function `f` (e.g., `def f(...): ...`), you can have nested definitions inside `f`
+- Ensure the function returns a value
+- Include at least one input parameter
+- Make the function deterministic
+- Make the snippet require state tracking across multiple data transformations, ensuring the task requires long multi step reasoning
+- AVOID THE FOLLOWING:
+  * Random functions or variables
+  * Date/time operations
+  * I/O operations (reading files, network requests)
+  * Printing or logging
+  * Any external state
+- Ensure execution completes within 10 seconds on a modern CPU
+- All imports and class definitions should be at the very top of the code snippet
+- The snippet should end with a return statement from the main function `f`, anything after will be removed
+{remove_input_from_snippet_prompt}{remove_after_return_prompt}
+### Input Requirements:
+- Provide exactly one test input for your function
+- Format multiple arguments with commas between them
+- Remember to add quotes around string arguments
+### Formatting:
+- Format your code with: ```python
+  def f(...):
+      # your code here
+      return ...
+  ```
+- Format your input with: ```input
+  arg1, arg2, ...
+  ```
+### Example Format:
+```python
+def f(name: str, info: dict):
+    # code logic here
+    return result
+```
+```input
+'John', {{'age': 20, 'city': 'New York'}}
+```
+### Evaluation Criteria:
+- Executability, your code should be executable given your input
+- Difficulty in predicting the output from your provided input and code snippet. Focus on either algorithmic reasoning or logic complexity. For example, you can define complex data structure classes and operate on them like trees, heaps, stacks, queues, graphs, etc, or use complex control flow, dynamic programming, recursions, divide and conquer, greedy, backtracking, etc
+- Creativity, the code needs to be sufficiently different from the provided reference snippets
+- Restricted usage of certain keywords and packages, you are not allowed to use the following words in any form, even in comments: <|BANNED_KEYWORDS|>
+First, carefully devise a clear plan: e.g., identify how your snippet will be challenging, distinct from reference snippets, and creative. Then, write the final code snippet and its inputs.
+### Reference Code Snippets:
+"""
+# Prompt template asking the model to write a new snippet plus one input, for a
+# task where a test subject must deduce the output from the given input.
+# Placeholders: {remove_input_from_snippet_prompt}, {remove_after_return_prompt}
+# and the literal marker <|BANNED_KEYWORDS|>. Literal braces are doubled, so the
+# text is presumably filled with str.format - confirm at the call site.
+# The text ends with the "Reference Code Snippets" header; the snippets are
+# presumably appended by the caller.
+# NOTE(review): the "Difficulty in predicting your ```input``` ..." bullet under
+# Evaluation Criteria does not match the deduce-the-output task stated at the
+# top, and looks swapped with the bullet in code_input_prompt. The wording is
+# runtime text, so it is left untouched here - confirm before changing.
+code_output_prompt = """
+## Task: Create a New Python Code Snippet (where custom classes are allowed, which should be defined at the top of the code snippet) with one Matching Input
+Using the reference code snippets provided below as examples, design a new and unique Python code snippet that demands deep algorithmic reasoning to deduce the output from the input. Your submission should include a code snippet and a test input pair, where the input will be plugged into the code snippet to produce the output. The input will be given to a test subject to deduce the output, which is meant to be an I.Q. test.
+### Code Requirements:
+- Name the entry function `f` (e.g., `def f(...): ...`), you can have nested definitions inside `f`
+- Ensure the function returns a value
+- Include at least one input parameter
+- Make the function deterministic
+- Make the snippet require state tracking across multiple data transformations, ensuring the task requires long multi step reasoning
+- AVOID THE FOLLOWING:
+  * Random functions or variables
+  * Date/time operations
+  * I/O operations (reading files, network requests)
+  * Printing or logging
+  * Any external state
+- Ensure execution completes within 10 seconds on a modern CPU
+- All imports and class definitions should be at the very top of the code snippet
+- The snippet should end with a return statement from the main function `f`, anything after will be removed
+{remove_input_from_snippet_prompt}{remove_after_return_prompt}
+### Input Requirements:
+- Provide exactly one test input for your function
+- Format multiple arguments with commas between them
+- Remember to add quotes around string arguments
+### Formatting:
+- Format your code with:
+```python
+def f(...):
+    # your code here
+    return ...
+```
+- Format your input with:
+```input
+arg1, arg2, ...
+```
+### Example Format:
+```python
+def f(name: str, info: dict):
+    # code logic here
+    return result
+```
+```input
+'John', {{'age': 20, 'city': 'New York'}}
+```
+### Evaluation Criteria:
+- Executability, your code should be executable given your input
+- Difficulty in predicting your ```input``` from 1) your ```python``` code and 2) the deterministic ```output``` that will be obtained from your ```input```. Focus on either algorithmic reasoning or logic complexity. For example, you can define complex data structure classes and operate on them like trees, heaps, stacks, queues, graphs, etc, or use complex control flow, dynamic programming, recursions, divide and conquer, greedy, backtracking, etc
+- Creativity, the code needs to be sufficiently different from the provided reference snippets
+- Restricted usage of certain keywords and packages, you are not allowed to use the following words in any form, even in comments: <|BANNED_KEYWORDS|>
+First, carefully devise a clear plan: e.g., identify how your snippet will be challenging, distinct from reference snippets, and creative. Then, write the final code snippet and its inputs.
+### Reference Code Snippets:
+"""
+# Proposer template for `code_e` tasks: asks the model to write a snippet `f` plus one input such
+# that a test subject must deduce which error type (or no error) is raised.
+# Filled via `str.format` in `get_code_problem_generator_prompt` with `remove_after_return_prompt`
+# only; literal braces are escaped as `{{ }}`. The `<|BANNED_KEYWORDS|>` and
+# `<|BANNED_ASSERTION_KEYWORDS|>` markers are substituted afterwards with `str.replace`.
+code_error_prompt = """
+## Task: Create a New Python Code Snippet (where custom classes are allowed, which should be defined at the top of the code snippet) with one Matching Input
+Using the reference code snippets provided below as examples, design a new and unique Python code snippet that demands deep algorithmic reasoning to deduce what type of error will be raised when the code is executed. Your submission should include a code snippet and a test input pair, where the input will be plugged into the code snippet to produce the error. You can also choose to include a custom error type in your code snippet. However, the code can also be designed to raise no error. The input and the code will be given to a test subject to deduce the error type, which is meant to be an I.Q. test.
+### Code Requirements:
+- Name the entry function `f` (e.g., `def f(...): ...`), you can have nested definitions inside `f`
+- Ensure the function returns a value
+- Include at least one input parameter
+- Make the function deterministic
+- Make the snippet require state tracking across multiple data transformations, ensuring the task requires long multi step reasoning
+- AVOID THE FOLLOWING:
+  * Random functions or variables
+  * Date/time operations
+  * I/O operations (reading files, network requests)
+  * Printing or logging
+  * Any external state
+- Ensure execution completes within 10 seconds on a modern CPU
+- All imports and class definitions should be at the very top of the code snippet
+- The snippet should end with a return statement from the main function `f`, anything after will be removed
+{remove_after_return_prompt}
+### Input Requirements:
+- Provide exactly one test input for your function
+- Format multiple arguments with commas between them
+- Remember to add quotes around string arguments
+### Formatting:
+- Format your code with:
+```python
+def f(...):
+    # your code here
+    return ...
+```
+- Format your input with:
+```input
+arg1, arg2, ...
+```
+### Example Format:
+```python
+def f(name: str, info: dict):
+    # code logic here
+    return result
+```
+```input
+'John', {{'age': 20, 'city': 'New York'}}
+```
+### Evaluation Criteria:
+- Executability, your code should be executable given your input
+- Difficulty in deducing the error type (or no error) from 1) your ```python``` code and ```input```. Focus on either algorithmic reasoning or logic complexity. For example, you can define complex data structure classes and operate on them like trees, heaps, stacks, queues, graphs, etc, or use complex control flow, dynamic programming, recursions, divide and conquer, greedy, backtracking, etc
+- Creativity, the code needs to be sufficiently different from the provided reference snippets
+- Restricted usage of certain keywords and packages, you are not allowed to use the following words in any form, even in comments: <|BANNED_KEYWORDS|>
+<|BANNED_ASSERTION_KEYWORDS|>
+First, carefully devise a clear plan: e.g., identify how your snippet will be challenging, distinct from reference snippets, and creative. Then, write the final code snippet and its inputs. The code needs to compile and pass AST checks, but it is intended to raise an error or not.
+### Reference Code Snippets:
+"""
+# Proposer template for `code_f` tasks: given an existing snippet, asks the model for several
+# inputs plus a hint message from which a test subject must deduce the function.
+# Filled via `str.format` in `get_code_problem_generator_prompt` with `num_inputs` and `snippet`
+# (the first reference snippet with `code_suffix` appended); literal braces are escaped as `{{ }}`.
+code_function_prompt = """
+## Task: Output {num_inputs} Inputs that can be plugged into the following Code Snippet to produce diverse Outputs, and give a message related to the given snippet.
+Using the code snippet provided below, design {num_inputs} inputs that can be plugged into the code snippet to produce a diverse set of outputs. A subset of your given input and its deterministically produced outputs will be given to a test subject to deduce the function, which is meant to be an I.Q. test. You can also leave a message to the test subject to help them deduce the code snippet.
+### Input Requirements:
+- Provide {num_inputs} valid inputs for the code snippet
+- For each input, format multiple arguments with commas between them
+- Remember to add quotes around string arguments
+- Each input should be individually wrapped in ```input``` tags
+### Message Requirements:
+- Leave a message to the test subject to help them deduce the code snippet
+- The message should be wrapped in ```message``` tags
+- The message can be in any form, can even be formed into a coding question, or a natural language instruction what the code snippet does
+- You cannot provide the code snippet in the message
+### Formatting:
+- Format your input with:
+```input
+arg1, arg2, ...
+```
+### Example Format:
+```input
+'John', {{'age': 20, 'city': 'New York'}}
+```
+```input
+'Sammy', {{'age': 37, 'city': 'Los Angeles'}}
+```
+### Evaluation Criteria:
+- Executability, your code should be executable given your inputs
+- Coverage, the inputs and outputs should cover the whole input space of the code snippet, able to deduce the code snippet from the inputs and outputs
+- Creativity, the inputs need to be sufficiently different from each other
+- The overall selection of inputs and message combined should be challenging for the test subject, but not impossible for them to solve
+First, carefully devise a clear plan: e.g., understand the code snippet, then identify how your proposed inputs have high coverage, and why the inputs will be challenging and creative. Then, write the inputs and message. Remember to wrap your inputs in ```input``` tags, and your message in ```message``` tags.
+### Code Snippet:
+```python
+{snippet}
+```
+"""
+# code_input_predictor_prompt = """
+# # Task: Provide One Possible Input of a Python Code Snippet Given the Code and Output
+# Given the following Code Snippet and the Output, think step by step then provide one possible input that produced the output. The input needs to be wrapped in ```input``` tags. Remember if an argument is a string, wrap it in quotes. If the function requires multiple arguments, separate them with commas.
+# # Code Snippet:
+# ```python
+# {snippet}
+# ```
+# # Output:
+# ```output
+# {output}
+# ```
+# # Output Format:
+# ```input
+# arg1, arg2, ...
+# ```
+# # Example Output:
+# ```input
+# 'John', {{'age': 20, 'city': 'New York'}}
+# ```
+# """
+# code_output_predictor_prompt = """
+# # Task: Deduce the Output of a Python Code Snippet Given the Code and Input
+# Given the following Code Snippet and the Input, think step by step then deduce the output that will be produced from plugging the Input into the Code Snippet. Put your output in ```output``` tags. Remember if the output is a string, wrap it in quotes. If the function returns multiple values, remember to use a tuple to wrap them.
+# # Code Snippet:
+# ```python
+# {snippet}
+# ```
+# # Input:
+# ```input
+# {input_args}
+# ```
+# # Example Output:
+# ```output
+# {{'age': 20, 'city': 'New York'}}
+# ```
+# """
+# Solver template for `code_e` tasks: the model must name the error type raised by running the
+# snippet on the given input, or "NoError".
+# Filled via `str.format` in `get_code_problem_predictor_prompt` with `snippet` and `input_args`.
+code_error_predictor_prompt = """
+# Task: Deduce the Error Type of a Python Code Snippet Given the Code and Input
+Given the following Code Snippet and the Input, think step by step to deduce the error type that will be raised when the code is executed. Put your final output in ```output``` tags. If there are no errors, put "NoError" in the ```output``` tags.
+# Code Snippet:
+```python
+{snippet}
+```
+# Input:
+```input
+{input_args}
+```
+# Example Output:
+```output
+ValueError
+```
+"""
+# code_suffix = "\nf(<|YOUR INPUT WILL BE PLUGGED HERE|>)"
+# code_function_predictor_prompt = """
+# # Task: Deduce the Function that Produced the Outputs from the Inputs
+# Given a set of input/output pairs and a message that describes the function, think through the problem step by step to deduce a general code snippet. This code should produce the hidden outputs from the hidden inputs, matching the original data-generating code that created the input/output pairs. Place your final answer inside python tags! It may be helpful to work through each input/output pair individually to test your function. If your function doesn’t work as expected, revise it until it does. The final code snippet will be used to evaluate your response, which is wrapped in ```python``` tags.
+# # Code Requirements:
+# - Name the entry function `f` (e.g., `def f(...): ...`), you can have nested definitions inside `f`
+# - Ensure the function returns a value
+# - Include at least one input parameter
+# - Make the function deterministic
+# - AVOID THE FOLLOWING:
+#   * Random functions or variables
+#   * Date/time operations
+#   * I/O operations (reading files, network requests)
+#   * Printing or logging
+#   * Any external state
+# - Ensure execution completes within 10 seconds on a modern CPU
+# - All imports and class definitions should be at the very top of the code snippet
+# - The snippet should end with a return statement from the main function `f()`, anything after will be removed
+# # Input and Output Pairs:
+# {input_output_pairs}
+# # Message:
+# ```message
+# {message}
+# ```
+# # Example Output:
+# ```python
+# def f(a):
+#     return a
+# ```
+# Name your entry function `f()`!!!
+# """
+#################################
+#        Changed Prompt         #
+#################################
+# Solver template for `code_i` tasks: given a snippet and its observed output, the model must
+# answer with one input inside `<think>` / `<answer>` blocks.
+# Filled via `str.format` in `get_code_problem_predictor_prompt` with `snippet` and `output`.
+# NOTE(review): "Assitant" below is a typo inside the runtime prompt text; it is left untouched
+# here because changing the prompt changes what the model sees — confirm before correcting.
+code_input_predictor_prompt =  """
+A conversation between User and Assistant.
+The User provides a Python code snippet and its observed output. The Assistant must:
+1. **Privately think step-by-step** about which input produces that output.
+2. **Output exactly one** `<think>...</think>` block containing your full reasoning.
+3. **Then output exactly one** `<answer>...</answer>` block containing **only** the input values—no labels, no comments, no extra text.
+4. **Do not** generate any text outside these two blocks.
+5. Adhere to the **input rules**.
+# Input Rules:
+- If an argument is a string, wrap it in quotes.
+- For multiple arguments, separate by commas.
+- Use Python literal notation for lists, dicts, tuples.
+- Boolean values must be `True` or `False`.
+User:
+# Python Code Snippet:
+{snippet}
+# Observed Output:
+{output}
+# Assitant should follow this format:
+# Example Response format:
+<think>
+# 1. Analyze the function signature.
+# 2. Walk through the code to see how the observed output arises.
+# 3. Identify specific input values that yield that output.
+</think>
+<answer>
+<your input here>
+</answer>
+Assistant:
+"""
+# Solver template for `code_o` tasks: given a snippet and concrete input, the model must answer
+# with the produced output inside `<think>` / `<answer>` blocks.
+# Filled via `str.format` in `get_code_problem_predictor_prompt` with `snippet` and `input_args`.
+# NOTE(review): "Assitant" below is a typo inside the runtime prompt text; it is left untouched
+# here because changing the prompt changes what the model sees — confirm before correcting.
+code_output_predictor_prompt = """
+A conversation between User and Assistant.
+The User provides a Python code snippet and specific input values. The Assistant must:
+1. **Privately think step-by-step** about how the code executes with the given inputs.
+2. **Output exactly one** `<think>...</think>` block containing your full reasoning.
+3. **Then output exactly one** `<answer>...</answer>` block containing **only** the output values—no labels, no comments, no extra text.
+4. **Do not** generate any text outside these two blocks.
+5. Adhere to the **output rules**.
+# Output Rules:
+- If the output is a string, wrap it in quotes.
+- For dicts, lists, and other literals, use valid Python literal notation.
+User:
+# Python Code Snippet:
+{snippet}
+# Input:
+{input_args}
+# Assitant should follow this format:
+<think>
+# 1. Examine the code and input.
+# 2. Walk through execution step by step.
+# 3. Determine the exact output produced.
+</think>
+<answer>
+<your output here>
+</answer>
+Assistant:
+"""
+# Appended to the reference snippet in the `code_f` proposer prompt to show where a proposed
+# input would be plugged into the call to `f`.
+code_suffix = "\nf(<|YOUR INPUT WILL BE PLUGGED HERE|>)"
+# Solver template for `code_f` tasks: given input/output pairs and a hint message, the model must
+# reconstruct the function `f` inside `<think>` / `<answer>` blocks.
+# Filled via `str.format` in `get_code_problem_predictor_prompt` with `input_output_pairs`
+# (pre-rendered as fenced ```input_i``` / ```output_i``` blocks) and `message`.
+code_function_predictor_prompt = """
+A conversation between User and Assistant.
+The User provides a set of input/output pairs and a message describing the hidden function. The Assistant must:
+1. **Privately think step-by-step** about how to reconstruct the general function based on the provided examples.
+2. **Output exactly one** `<think>...</think>` block containing the full reasoning process.
+3. **Then output exactly one** `<answer>...</answer>` block containing **only** the Python code snippet defining the function `f`—no labels, no comments, no extra text.
+4. **Do not** generate any text outside these two blocks.
+5. Follow to the **code requirements** and **formatting rules**.
+# Code Requirements:
+- Name the entry function `f` (e.g., `def f(...): ...`), you may include nested definitions inside `f`.
+- Ensure the function returns a value.
+- Include at least one input parameter.
+- Make the function deterministic.
+- AVOID the FOLLOWING:
+  * Random functions or variables
+  * Date/time operations
+  * I/O operations (reading files, network requests)
+  * Printing or logging
+  * Any external state
+- Ensure execution completes within 10 seconds on a modern CPU.
+- All imports and custom class definitions must be at the very top of the code snippet.
+- The snippet must end with a return statement from the main function `f`; anything after will be removed.
+User:
+# Input and Output Pairs:
+{input_output_pairs}
+# Message:
+{message}
+# Assistant should follow this format:
+<think>
+# 1. Review each input/output pair and the message to understand the pattern.
+# 2. Infer the general algorithm or transformation being applied.
+# 3. Outline the structure of function `f` that would reproduce all examples.
+# 4. Ensure the function meets all requirements.
+</think>
+<answer>
+def f(...):
+    # your code here
+    return ...
+</answer>
+Assistant:
+"""
+# Previous wording of the composite-function requirement, kept for reference:
+# composite_requirements_prompt = "\n[IMPORTANT CRITERIA!!!] The main function `f` MUST make calls to ALL these functions {function_names} in its body, and you SHOULD NOT provide the definition of {function_names} in your output code snippet. You should first reason step by step about what these functions, {function_names}, do, then write the code snippet.\n" + '\n### The Functions that Must ALL be Called in your Code Snippet: \n```python\n{composite_functions}\n```\n'
+# Suffix appended to proposer prompts when composite functions are supplied; formatted with
+# `function_names` (comma-separated `g_i` names) and `composite_functions` (their source code).
+composite_requirements_prompt = "\n[IMPORTANT CRITERIA!!!] The main function `f` MUST make calls to ALL these functions {function_names} in its body, and you SHOULD NOT provide the definition of {function_names} in your output code snippet. The function `f` should build on top of {function_names} with extra functionalities, not just a simple wrapper. You should first reason step by step about what these functions, {function_names}, do, then write the code snippet.\n" + '\n### The Functions that Must ALL be Called in your Code Snippet: \n```python\n{composite_functions}\n```\n'
+# Optional requirement line substituted for `{remove_input_from_snippet_prompt}` in proposer prompts.
+remove_input_from_snippet_prompt = "- Do not have the test input anywhere in the code snippet, provide it in the input section."
+# Optional requirement line substituted for `{remove_after_return_prompt}` when `remove_after_return` is set.
+remove_singleton_variables_prompt = "- All variable declarations must be inside the main function `f` or within functions `f` make calls to. Any variables declared outside of functions will be removed.\n"
+def get_code_problem_generator_prompt(
+    problem_type: str,
+    reference_snippets: List[Dict[str, str]],
+    banned_keywords: List[str],
+    banned_assertion_keywords: List[str],
+    composite_functions: List[str] = None,
+    remove_after_return: bool = False,
+    num_inputs: int = 10,
+    remove_input_from_snippet: bool = False,
+) -> str:
+    # assert not (remove_after_return and not remove_input_from_snippet)
+    composite_functions = list(composite_functions)
+    snippet_string = ""
+    if problem_type != 'code_f':
+        output_key = 'output' if problem_type != 'code_e' else 'error'
+        for i, snippet in enumerate(reference_snippets):
+            snippet_string += f"<snippet_{i}>\n```python\n{snippet['snippet']}\n```\n```input\n{snippet['input']}\n```\n```{output_key}\n{snippet['output']}\n```\n</snippet_{i}>\n"
+    if problem_type == "code_i":
+        return code_input_prompt.format(
+            remove_after_return_prompt=(remove_singleton_variables_prompt if remove_after_return else '\n'),
+            remove_input_from_snippet_prompt=(remove_input_from_snippet_prompt if remove_input_from_snippet else '')
+        ).replace(
+            '<|BANNED_KEYWORDS|>', ', '.join(banned_keywords)
+        ) + snippet_string + (
+            composite_requirements_prompt.format(
+                function_names=', '.join([f'`g_{i}`' for i in range(len(composite_functions))]),
+                composite_functions="\n".join([d['snippet'] for d in composite_functions])
+            ) if composite_functions else '\n'
+        )
+    elif problem_type == "code_o":
+        return code_output_prompt.format(
+            remove_after_return_prompt=(remove_singleton_variables_prompt if remove_after_return else '\n'),
+            remove_input_from_snippet_prompt=(remove_input_from_snippet_prompt if remove_input_from_snippet else '')
+        ).replace(
+            '<|BANNED_KEYWORDS|>', ', '.join(banned_keywords)
+        ) + snippet_string + (
+            composite_requirements_prompt.format(
+                function_names=', '.join([f'`g_{i}`' for i in range(len(composite_functions))]),
+                composite_functions="\n".join([d['snippet'] for d in composite_functions])
+            ) if composite_functions else '\n'
+        )
+    elif problem_type == "code_f":
+        return code_function_prompt.format(
+            num_inputs=num_inputs,
+            snippet=reference_snippets[0]['snippet'] + code_suffix,
+        )
+    elif problem_type == "code_e":
+        if banned_assertion_keywords:
+            assertion_keywords_string = '- The following error handling keywords are not allowed to be used in the code snippet: ' + ', '.join(banned_assertion_keywords) + '\n'
+        else:
+            assertion_keywords_string = '\n'
+        return code_error_prompt.format(
+            remove_after_return_prompt=(remove_singleton_variables_prompt if remove_after_return else '\n'),
+        ).replace(
+            '<|BANNED_KEYWORDS|>', ', '.join(banned_keywords)
+        ).replace(
+            '<|BANNED_ASSERTION_KEYWORDS|>', assertion_keywords_string
+        ) + snippet_string + (
+            composite_requirements_prompt.format(
+                function_names=', '.join([f'`g_{i}`' for i in range(len(composite_functions))]),
+                composite_functions="\n".join([d['snippet'] for d in composite_functions])
+            ) if composite_functions else '\n'
+        )
+    else:
+        raise ValueError(f"Invalid problem type: {problem_type}")
+def get_code_problem_predictor_prompt(problem_type: str, snippet: str, input_args: str = None, output: str = None, message: str = None, input_output_pairs: List[Tuple[str, str]] = None) -> str:
+    if problem_type.endswith("code_i"):
+        return code_input_predictor_prompt.format(snippet=snippet, output=output)
+    elif problem_type.endswith("code_o"):
+        return code_output_predictor_prompt.format(snippet=snippet, input_args=input_args)
+    elif problem_type.endswith("code_f"):
+        input_output_pairs_string = ""
+        for i, (input, output) in enumerate(input_output_pairs):
+            input_output_pairs_string += f"```input_{i}\n{input}\n```\n```output_{i}\n{output}\n```\n"
+        return code_function_predictor_prompt.format(input_output_pairs=input_output_pairs_string, message=message)
+    elif problem_type.endswith("code_e"):
+        return code_error_predictor_prompt.format(snippet=snippet, input_args=input_args)
+    else:
+        raise ValueError(f"Invalid problem type: {problem_type}")

absolute_zero_reasoner/main_azr_ppo.py ADDED Viewed

	@@ -0,0 +1,260 @@

+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Note that we don't combine the main with ray_trainer as ray_trainer is used by other main.
+"""
+import ray
+import hydra
+from pathlib import Path
+from pprint import pprint
+from omegaconf import OmegaConf
+from verl.utils.fs import copy_local_path_from_hdfs
+from verl.utils import hf_tokenizer
+from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
+from absolute_zero_reasoner.trainer.ppo.azr_ray_trainer import CodeIORayPPOTrainer
+from absolute_zero_reasoner.rewards.reward_managers import CodeIORewardManager
+@hydra.main(config_path='configs', config_name='azr_ppo_trainer', version_base=None)
+def main(config):
+    """Hydra entry point: receives the composed ``azr_ppo_trainer`` config and hands it to ``run_ppo``."""
+    run_ppo(config)
+# Define a function to run the PPO-like training process
+def run_ppo(config) -> None:
+    # Check if Ray is not initialized
+    if not ray.is_initialized():
+        # Initialize Ray with a local cluster configuration
+        # Set environment variables in the runtime environment to control tokenizer parallelism,
+        # NCCL debug level, VLLM logging level, and allow runtime LoRA updating
+        # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration
+        import os
+        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
+        ray.init(
+            runtime_env={"env_vars": {
+                "TOKENIZERS_PARALLELISM": "true",
+                "NCCL_DEBUG": "WARN",
+                "VLLM_LOGGING_LEVEL": "WARN",
+                "VLLM_ALLOW_RUNTIME_LORA_UPDATING": "true",
+                "CUDA_VISIBLE_DEVICES": cuda_visible_devices
+            }},
+            num_cpus=config.ray_init.num_cpus,
+        )
+    # Create a remote instance of the TaskRunner class, and
+    # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete
+    if OmegaConf.select(config.trainer, "profile_steps") is not None and len(OmegaConf.select(config.trainer, "profile_steps")) > 0:
+        nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options)
+        runner = TaskRunner.options(runtime_env={
+            "nsight": nsight_options,
+            "env_vars": {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}
+        }).remote()
+    else:
+        runner = TaskRunner.options(runtime_env={
+            "env_vars": {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}
+        }).remote()
+    ray.get(runner.run.remote(config))
+    # [Optional] get the path of the timeline trace file from the configuration, default to None
+    # This file is used for performance analysis
+    timeline_json_file = config.ray_init.get("timeline_json_file", None)
+    if timeline_json_file:
+        ray.timeline(filename=timeline_json_file)
+@ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
+class TaskRunner:
+    def run(self, config):
+        pprint(OmegaConf.to_container(config, resolve=True))  # resolve=True will eval symbol values
+        OmegaConf.resolve(config)
+        if config.trainer.debug:
+            import debugpy
+            debugpy.listen(("0.0.0.0", config.trainer.debug_port))
+            print(f"Debugger listening on port {config.trainer.debug_port}")
+            debugpy.wait_for_client()
+            print("Debugger attached!")
+        # generator one batch, solver one batch
+        config.actor_rollout_ref.actor.ppo_mini_batch_size = config.data.train_batch_size * len(config.azr.problem_types) * (2 if config.azr.train_propose else 1)
+        pprint(f"auto setting ppo_mini_batch_size: {config.actor_rollout_ref.actor.ppo_mini_batch_size}")
+        config.azr.data_selection_strategy.data_len = config.data.train_batch_size * config.azr.data_selection_strategy.update_iteration
+        pprint(f"auto setting data_len: {config.azr.data_selection_strategy.data_len}")
+        config.trainer.default_local_dir = (Path(config.trainer.default_local_dir) / config.data.train_files.split('/')[-1].split('.')[0] / config.actor_rollout_ref.model.path.split('/')[-1] / config.reward_fn.extraction_type).as_posix()
+        assert not (not config.azr.reward.generation_reward_config.reject_multiple_functions and config.azr.data_selection_strategy.composite_function_n_min > 0), "If reject_multiple_functions is False, composite_function_n_min must be 0"
+        # download the checkpoint from hdfs
+        local_path = copy_local_path_from_hdfs(config.actor_rollout_ref.model.path)
+        # Instantiate the tokenizer and processor.
+        from verl.utils import hf_processor, hf_tokenizer
+        trust_remote_code = config.data.get("trust_remote_code", False)
+        tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
+        # base model chat template
+        if config.actor_rollout_ref.model.pretrained_tokenizer:
+            tokenizer.chat_template = "{%- for message in messages -%}{{- '\n' if not loop.first -}}{{- message['content'] -}}{%- endfor -%}"
+        # Used for multimodal LLM, could be None
+        processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
+        # Version validation for vllm.
+        if config.actor_rollout_ref.rollout.name in ["vllm"]:
+            from verl.utils.vllm_utils import is_version_ge
+            if config.actor_rollout_ref.model.get("lora_rank", 0) > 0:
+                if not is_version_ge(pkg="vllm", minver="0.7.3"):
+                    raise NotImplementedError("PPO LoRA is not supported before vllm 0.7.3")
+        # Define worker classes based on the actor strategy.
+        if config.actor_rollout_ref.actor.strategy in ["fsdp", "fsdp2"]:
+            assert config.critic.strategy in ["fsdp", "fsdp2"]
+            from verl.single_controller.ray import RayWorkerGroup
+            from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
+            actor_rollout_cls = AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker
+            ray_worker_group_cls = RayWorkerGroup
+        elif config.actor_rollout_ref.actor.strategy == "megatron":
+            assert config.actor_rol# lout_ref.actor.strategy == config.critic.strategy
+            from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+            from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker
+            actor_rollout_cls = AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker
+            ray_worker_group_cls = NVMegatronRayWorkerGroup
+        else:
+            raise NotImplementedError
+        from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
+        # Map roles to their corresponding remote worker classes.
+        role_worker_mapping = {
+            Role.ActorRollout: ray.remote(actor_rollout_cls),
+            Role.Critic: ray.remote(CriticWorker),
+        }
+        # Define the resource pool specification.
+        # Map roles to the resource pool.
+        global_pool_id = "global_pool"
+        resource_pool_spec = {
+            global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
+        }
+        mapping = {
+            Role.ActorRollout: global_pool_id,
+            Role.Critic: global_pool_id,
+        }
+        # We should adopt a multi-source reward function here:
+        # - for rule-based rm, we directly call a reward score
+        # - for model-based rm, we call a model
+        # - for code related prompt, we send to a sandbox if there are test cases
+        # finally, we combine all the rewards together
+        # The reward type depends on the tag of the data
+        if config.reward_model.enable:
+            if config.reward_model.strategy in ["fsdp", "fsdp2"]:
+                from verl.workers.fsdp_workers import RewardModelWorker
+            elif config.reward_model.strategy == "megatron":
+                from verl.workers.megatron_workers import RewardModelWorker
+            else:
+                raise NotImplementedError
+            role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
+            mapping[Role.RewardModel] = global_pool_id
+        # Add a reference policy worker if KL loss or KL reward is used.
+        if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
+            role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
+            mapping[Role.RefPolicy] = global_pool_id
+        reward_fn = CodeIORewardManager(
+            tokenizer=tokenizer,
+            num_examine=0,
+            reward_fn_extraction_type=config.reward_fn.extraction_type,
+            math_metric=config.reward_fn.math_metric,
+            split='train',
+            splitter=config.reward_fn.splitter,
+            output_path=config.trainer.default_local_dir,
+            max_prompt_length=config.data.max_prompt_length,
+            generation_reward_config=config.azr.reward.generation_reward_config,
+            valid_program_filter=config.azr.data_selection_strategy.valid_program_filter,
+            debug=config.trainer.debug,
+            extract_code_block=config.azr.reward.extract_code_block,
+            code_f_reward_type=config.azr.reward.code_f_reward_type,
+            boxed_retry=config.reward_fn.boxed_retry,
+        )
+        # Note that we always use function-based RM for validation
+        val_reward_fn = CodeIORewardManager(
+            tokenizer=tokenizer,
+            num_examine=1,
+            reward_fn_extraction_type=config.reward_fn.extraction_type,
+            math_metric=config.reward_fn.math_metric,
+            split='test',
+            splitter=config.reward_fn.splitter,
+            output_path=config.trainer.default_local_dir,
+            max_prompt_length=config.data.max_prompt_length,
+            generation_reward_config=config.azr.reward.generation_reward_config,
+            valid_program_filter=config.azr.data_selection_strategy.valid_program_filter,
+            debug=config.trainer.debug,
+            extract_code_block=config.azr.reward.extract_code_block,
+            code_f_reward_type=config.azr.reward.code_f_reward_type,
+            boxed_retry=config.reward_fn.boxed_retry,
+        )
+        resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+        wandb_tags = [
+            'codeio', config.azr.pred_data_mix_strategy, 'executor-' + config.azr.executor,
+            config.azr.data_selection_strategy.valid_program_filter, config.azr.gen_data_probabilities_strategy,
+        ]
+        wandb_tags.extend(config.azr.problem_types)
+        if config.trainer.wandb_tags is not None:
+            config.trainer.wandb_tags = wandb_tags + config.trainer.wandb_tags.split(',')
+        else:
+            config.trainer.wandb_tags = wandb_tags
+        trainer = CodeIORayPPOTrainer(
+            past_epoch_window=config.azr.past_epoch_window,
+            config=config,
+            tokenizer=tokenizer,
+            processor=processor,
+            role_worker_mapping=role_worker_mapping,
+            resource_pool_manager=resource_pool_manager,
+            ray_worker_group_cls=ray_worker_group_cls,
+            reward_fn=reward_fn,
+            val_reward_fn=val_reward_fn,
+        )
+        trainer.init_workers()
+        trainer.fit()
+if __name__ == '__main__':
+    # Script entry point: run main() and convert failures into a process exit code.
+    import os
+    import sys
+    import traceback
+    try:
+        main()
+    except KeyboardInterrupt:
+        # A user interrupt is reported but treated as a normal exit.
+        traceback.print_exc()
+        sys.exit(0)
+    except Exception:
+        # Any other failure prints the traceback and terminates immediately via
+        # os._exit, which exits without running normal interpreter cleanup.
+        traceback.print_exc()
+        os._exit(1)

absolute_zero_reasoner/rewards/__init__.py ADDED Viewed

File without changes

absolute_zero_reasoner/rewards/code_reward.py ADDED Viewed

	@@ -0,0 +1,554 @@

+"""
+https://github.com/huggingface/open-r1
+"""
+import re
+import json
+from typing import Dict, Any, List, Tuple
+import ast
+import difflib
+import json
+from complexipy import code_complexity
+import black
+import autopep8
+from absolute_zero_reasoner.utils.code_utils.parsers import (
+    parse_imports,
+    remove_comments_and_docstrings,
+    remove_any_not_definition_imports,
+    remove_print_statements,
+)
+def format_python_code(code: str) -> str:
+    """
+    Format Python code, preferring black and falling back to a heuristic re-indent plus autopep8.
+    Args:
+        code: A string containing Python code
+    Returns:
+        The black-formatted code when black accepts the input; otherwise the result of
+        running autopep8 over a line-by-line re-indented copy of the input
+    """
+    try:
+        # First try to use black for formatting
+        return black.format_str(code, mode=black.Mode())
+    except Exception:
+        # Only ordinary errors trigger the fallback below; KeyboardInterrupt and
+        # SystemExit now propagate instead of being swallowed by a bare `except:`.
+        pass
+    # Fallback: rebuild the indentation line by line. Every line following a
+    # `def ...:` header is emitted as a flat, 4-space indented body line.
+    formatted_lines = []
+    in_function = False
+    seen_return = False
+    for line in code.split('\n'):
+        stripped = line.strip()
+        if not stripped:
+            # Blank lines are kept; a blank line after a return statement is
+            # taken as the end of the current function.
+            if in_function and seen_return:
+                in_function = False
+            formatted_lines.append('')
+            continue
+        if stripped.startswith('def ') and stripped.endswith(':'):
+            # Function header: emitted at column 0, body lines follow.
+            in_function = True
+            formatted_lines.append(stripped)
+            continue
+        if not in_function:
+            # Line outside any function
+            formatted_lines.append(stripped)
+            continue
+        if stripped.startswith('return '):
+            formatted_lines.append('    ' + stripped)
+            seen_return = True
+            continue
+        if seen_return:
+            # NOTE(review): the original also tested that `stripped` does not start
+            # with whitespace, which is always true for a stripped line; the check
+            # is therefore equivalent to "a return has been seen". If the intent
+            # was to inspect the raw line's indentation, confirm before changing.
+            in_function = False
+            formatted_lines.append(stripped)
+            continue
+        # Regular function body line
+        formatted_lines.append('    ' + stripped)
+    # Apply autopep8 for final cleanup
+    return autopep8.fix_code(
+        '\n'.join(formatted_lines),
+        options={'aggressive': 1, 'indent_size': 4}
+    )
+def extract_code(completion: str) -> str:
+    """Return the body of the last ```python fenced block in `completion`, or "" when there is none."""
+    fenced_blocks = re.findall(r"```python\n(.*?)```", completion, flags=re.DOTALL)
+    if not fenced_blocks:
+        return ""
+    return fenced_blocks[-1]
+def parse_to_ast(code_snippet: str) -> ast.AST:
+    """
+    Turn a Python source string into its Abstract Syntax Tree.
+    Args:
+        code_snippet: Python source code
+    Returns:
+        The tree produced by `ast.parse`
+    Raises:
+        SyntaxError: If the snippet is not valid Python (the error is printed, then re-raised)
+    """
+    try:
+        tree = ast.parse(code_snippet)
+    except SyntaxError as syntax_err:
+        print(f"Syntax error in code: {syntax_err}")
+        raise
+    return tree
+def ast_to_dict(node: ast.AST) -> Dict[str, Any]:
+    """
+    Recursively convert an AST node into plain dictionaries so trees can be compared.
+    Args:
+        node: An AST node (any other object is wrapped as {"value": str(node)})
+    Returns:
+        A dictionary with the node's class name under "node_type" plus one entry per
+        non-None field; "ctx" fields are omitted
+    """
+    if not isinstance(node, ast.AST):
+        return {"value": str(node)}
+    converted: Dict[str, Any] = {"node_type": type(node).__name__}
+    for name, child in ast.iter_fields(node):
+        # Load/Store/Del contexts add noise without changing the structure; None fields carry nothing
+        if name == "ctx" or child is None:
+            continue
+        if isinstance(child, list):
+            # Only AST members of a list are kept; other list items are dropped
+            converted[name] = [ast_to_dict(member) for member in child if isinstance(member, ast.AST)]
+        elif isinstance(child, ast.AST):
+            converted[name] = ast_to_dict(child)
+        else:
+            # Primitive values (names, constants, ...) are stored as-is
+            converted[name] = child
+    return converted
+def ast_edit_distance(code1: str, code2: str) -> float:
+    """
+    Compute a normalized distance between the ASTs of two code snippets.
+    Both snippets are formatted, parsed, converted to dictionaries and serialized to
+    JSON; the distance is one minus the difflib similarity ratio of the two JSON strings.
+    Args:
+        code1: First code snippet
+        code2: Second code snippet
+    Returns:
+        A float distance (0.0 = identical serializations, 1.0 = nothing in common).
+        Any failure is printed and reported as 0.0.
+    """
+    try:
+        trees = [parse_to_ast(format_python_code(source)) for source in (code1, code2)]
+        as_dicts = [ast_to_dict(tree) for tree in trees]
+        first_json, second_json = [json.dumps(item, sort_keys=True, indent=2) for item in as_dicts]
+        matcher = difflib.SequenceMatcher(None, first_json, second_json)
+        return 1.0 - matcher.ratio()
+    except Exception as e:
+        print(f"Error in ast_edit_distance: {e}")
+        return 0.0
+def ast_edit_operations(ast1: ast.AST, ast2: ast.AST) -> List[Dict[str, Any]]:
+    """
+    List the line-level edits that turn the serialized form of ast1 into that of ast2.
+    Args:
+        ast1: First AST
+        ast2: Second AST
+    Returns:
+        A list of {"operation": "insert" | "delete", "content": <stripped JSON line>} entries,
+        in the order they appear in a zero-context unified diff
+    """
+    before_dict = ast_to_dict(ast1)
+    after_dict = ast_to_dict(ast2)
+    before_lines = json.dumps(before_dict, sort_keys=True, indent=2).splitlines()
+    after_lines = json.dumps(after_dict, sort_keys=True, indent=2).splitlines()
+    diff_lines = list(difflib.unified_diff(before_lines, after_lines, n=0))
+    # Map the diff marker to the operation it denotes; hunk headers and context lines map to nothing
+    marker_to_operation = {'+': "insert", '-': "delete"}
+    operations = []
+    for entry in diff_lines[2:]:  # the first two lines are the ---/+++ file headers
+        operation = marker_to_operation.get(entry[:1])
+        if operation is not None:
+            operations.append({
+                "operation": operation,
+                "content": entry[1:].strip()
+            })
+    return operations
+def get_code_complexity_reward(code_snippet: str) -> float:
+    """
+    Score a snippet by its complexipy complexity, scaled by 1/15.
+    Args:
+        code_snippet: A string containing Python code
+    Returns:
+        The `complexity` attribute reported by `code_complexity` for the formatted
+        snippet, divided by 15; 0.0 if formatting or analysis raises
+    """
+    try:
+        formatted = format_python_code(code_snippet)
+        report = code_complexity(formatted)
+        return report.complexity / 15
+    except Exception:
+        return 0.0
+def get_halstead_reward(code_snippet: str,
+                        effort_max: float = 10000,
+                        complexity_max: float = 10,
+                        volume_max: float = 500) -> float:
+    """
+    Combine Halstead effort, cyclomatic complexity and Halstead volume into one score.
+    Args:
+        code_snippet: A string containing Python code
+        effort_max: Effort value at which the effort term saturates at 1.0
+        complexity_max: Complexity value at which the complexity term saturates at 1.0
+        volume_max: Volume value at which the volume term saturates at 1.0
+    Returns:
+        0.5 * effort + 0.3 * complexity + 0.2 * volume (each term capped at 1.0),
+        rounded to three decimals; 0.0 if anything raises
+    """
+    try:
+        from radon.metrics import h_visit
+        from radon.complexity import cc_visit
+        formatted = format_python_code(code_snippet)
+        totals = h_visit(formatted).total
+        effort = totals.effort
+        volume = totals.volume
+        # Use the most complex block; default to 1 when radon reports no blocks
+        worst_complexity = max((block.complexity for block in cc_visit(formatted)), default=1)
+        effort_term = 0.5 * min(effort / effort_max, 1.0)
+        complexity_term = 0.3 * min(worst_complexity / complexity_max, 1.0)
+        volume_term = 0.2 * min(volume / volume_max, 1.0)
+        return round(effort_term + complexity_term + volume_term, 3)
+    except Exception:
+        return 0.0
+def has_test_input(snippet_code: str) -> bool:
+    """Return True when the snippet matches any heuristic pattern for embedded tests or sample inputs."""
+    test_patterns = (
+        r"(?i)#\s*(test|example)",  # a comment mentioning test/example
+        r"\b(input|test_input|sample_input)\b\s*=",  # assignment to a common test variable name
+        r"\b\w*input\w*\s*=\s*",    # assignment to any name containing "input"
+        r"\b(expected|output|result)\s*=\s*",
+        r"\bassert\b",
+        r"print\s*\(\s*f\(",
+        r"f\(\[.*\]\)",
+        r"f\([^)]*\)\s*(#|$)",
+        r"^\s*input\s*$",  # a line consisting only of "input"
+    )
+    for pattern in test_patterns:
+        if re.search(pattern, snippet_code, re.MULTILINE) is not None:
+            return True
+    return False
+def parse_code_input_output(
+    input_str: str,
+    parse_input: bool = True,
+    parse_output: bool = True,
+    remove_after_return: bool = False,
+    remove_comments: bool = False,
+    remove_print: bool = False,
+    reject_multiple_functions: bool = True,
+    reject_test_input_in_code: bool = False,
+    f_replace_location: str = 'not_first',
+    code_location: str = 'first',
+) -> Tuple[bool, Dict[str, str]]:
+    """
+    Extract the code, input and output blocks from a fenced-markdown string.
+    Args:
+        input_str: A string containing the code snippet
+        parse_input: Whether an input block is required and parsed
+        parse_output: Whether an output block is required and parsed
+        remove_after_return: Whether to pass the code through `remove_any_not_definition_imports`
+        remove_comments: Whether to pass the code through `remove_comments_and_docstrings`
+        remove_print: Whether to pass the code through `remove_print_statements`
+        reject_multiple_functions: Fail when more than one `def` is found
+        reject_test_input_in_code: Fail when `has_test_input` flags the code
+        f_replace_location: Which function is renamed to `f`: 'not_first' / 'not_last' pick the
+            first / last definition; 'any_first' / 'any_last' do the same unless a function
+            named `f` already exists
+        code_location: Use the 'first' or the 'last' fenced code block
+    Returns:
+        (True, {"code", "input", "output", "imports"}) on success, otherwise (False, {})
+    Raises:
+        ValueError: If `code_location` or `f_replace_location` is not a recognised value
+    """
+    # All block patterns are matched case-insensitively and across newlines
+    flags = re.DOTALL | re.IGNORECASE
+    code_pattern = r"```(?:python\s*)?\n?(.*?)\n?```"
+    if code_location == 'first':
+        code_match = re.search(code_pattern, input_str, flags)
+    elif code_location == 'last':
+        # Walk every match and keep the final one (None when there are no matches)
+        code_match = None
+        for code_match in re.finditer(code_pattern, input_str, flags):
+            pass
+    else:
+        raise ValueError(f"Invalid code_location: {code_location}. Must be 'first' or 'last'.")
+    def _find_block(fenced_pattern, comment_pattern):
+        # Prefer the fenced block; fall back to the "# Input:" / "# Output:" comment form
+        found = re.search(fenced_pattern, input_str, flags)
+        if found is None:
+            found = re.search(comment_pattern, input_str, flags)
+        return found
+    input_match = None
+    if parse_input:
+        input_match = _find_block(r"```input\s*\n?(.*?)\n?```", r"# Input:\s*(.*?)(?=\n```|$)")
+    output_match = None
+    if parse_output:
+        output_match = _find_block(r"```output\s*\n?(.*?)\n?```", r"# Output:\s*(.*?)(?=\n```|$)")
+    # Every requested component must be present
+    if code_match is None or (parse_input and input_match is None) or (parse_output and output_match is None):
+        return False, {}
+    code_snippet = code_match.group(1).strip()
+    input_snippet = input_match.group(1).strip() if parse_input else ""
+    output_snippet = output_match.group(1).strip() if parse_output else ""
+    # The code must define at least one function
+    function_defs = re.findall(r"^\s*def\s+(\w+)\s*\(", code_snippet, re.MULTILINE)
+    if not function_defs:
+        return False, {}
+    if reject_multiple_functions and len(function_defs) > 1:
+        return False, {}
+    if reject_test_input_in_code and has_test_input(code_snippet):
+        return False, {}
+    # Choose which definition gets standardized to the name 'f'
+    first_def, last_def = function_defs[0], function_defs[-1]
+    already_has_f = 'f' in function_defs
+    rename_candidates = {
+        'not_first': first_def,
+        'any_last': 'f' if already_has_f else last_def,
+        'any_first': 'f' if already_has_f else first_def,
+        'not_last': last_def,
+    }
+    if f_replace_location not in rename_candidates:
+        raise ValueError(f'Invalid f_replace_location: {f_replace_location}')
+    original_name = rename_candidates[f_replace_location]
+    if original_name != 'f':
+        escaped_name = re.escape(original_name)
+        code_snippet = re.sub(
+            rf"def\s+{escaped_name}\s*\(",
+            "def f(",
+            code_snippet,
+            count=0
+        )
+        # Call sites are renamed too, so recursive functions keep working
+        code_snippet = re.sub(
+            rf"\b{escaped_name}\s*\(",
+            "f(",
+            code_snippet
+        )
+    # Imports are collected after renaming but before any of the optional clean-ups
+    imports: List[str] = parse_imports(code_snippet)
+    if remove_comments:
+        code_snippet = remove_comments_and_docstrings(code_snippet)
+    if remove_after_return:
+        code_snippet = remove_any_not_definition_imports(code_snippet)
+    if remove_print:
+        code_snippet = remove_print_statements(code_snippet)
+    return True, {"code": code_snippet, "input": input_snippet, "output": output_snippet, "imports": imports}
+def parse_inputs_message(
+    input_str: str,
+    num_inputs: int,
+) -> Tuple[bool, Dict[str, Any]]:
+    """
+    Parse the last num_inputs inputs and message from a string.
+    Args:
+        input_str: A string containing the inputs and message
+        num_inputs: Number of most recent inputs to parse
+    Returns:
+        A tuple of (success, dict) where dict contains:
+        - inputs: List of last num_inputs input strings
+        - message: The message string
+        Returns (False, {}) if there aren't enough inputs or message is missing
+    """
+    input_pattern = r"```input\s*\n?(.*?)\n?```"
+    message_pattern = r"```message\s*\n?(.*?)\n?```"
+    # Use flags for case-insensitive matching and dotall
+    flags = re.DOTALL | re.IGNORECASE
+    # re.finditer returns an iterator, which is always truthy; materialize the
+    # matches so that "no fenced input block" can actually be detected.
+    input_matches = list(re.finditer(input_pattern, input_str, flags))
+    if not input_matches:
+        # Try alternative pattern without explicit input block
+        input_matches = list(re.finditer(r"# Input:\s*(.*?)(?=\n```|$)", input_str, flags))
+    inputs = [match.group(1).strip() for match in input_matches]
+    # Return early if not enough inputs
+    if len(inputs) < num_inputs:
+        return False, {}
+    inputs = inputs[-num_inputs:]  # Take last num_inputs
+    # Message lookup order: fenced ```message block, <message> tags, "# Message:" comment
+    message_match = re.search(message_pattern, input_str, flags)
+    if not message_match:
+        message_match = re.search(r"<message>\s*(.*?)\s*</message>", input_str, flags)
+    if not message_match:
+        message_match = re.search(r"# Message:\s*(.*?)(?=\n```|$)", input_str, flags)
+    # Return early if message not found
+    if not message_match:
+        return False, {}
+    message = message_match.group(1).strip()
+    return True, {"inputs": inputs, "message": message}
+def parse_code_function(input_str: str) -> Tuple[bool, str]:
+    """
+    Extract the last fenced code block from a string.
+    Args:
+        input_str: A string containing the code function
+    Returns:
+        (True, stripped block body) for the last fenced block, or (False, '') when none is found
+    """
+    fence_regex = re.compile(r"```(?:python\s*)?\n?(.*?)\n?```", re.DOTALL | re.IGNORECASE)
+    # Iterate through all blocks, remembering only the final match
+    last_block = None
+    for last_block in fence_regex.finditer(input_str):
+        pass
+    if last_block is None:
+        return False, ''
+    return True, last_block.group(1).strip()
+def valid_code(solution_str: str, executor, banned_words: List[str]) -> Tuple[bool, str]:
+    """
+    Check that a solution parses, contains no banned word, and runs without error.
+    Args:
+        solution_str: Text containing a code block and an input block
+        executor: Object whose `apply(code)` returns an `(output, status)` pair
+        banned_words: Words that must not appear (case-insensitively) in the code
+    Returns:
+        (True, output) when the code ran and `status` does not contain "error";
+        (False, None) otherwise, including when parsing fails or execution raises
+    """
+    success, result = parse_code_input_output(solution_str, parse_output=False)
+    if not success:
+        return False, None
+    try:
+        # Screen for banned words BEFORE executing: the verdict is the same as
+        # checking afterwards, but rejected code is never handed to the executor.
+        lowered_code = result['code'].lower()
+        if any(banned_word.lower() in lowered_code for banned_word in banned_words):
+            return False, None
+        output, status = executor.apply(result['code'] + f'\nf({result["input"]})')
+        if 'error' in status.lower():
+            return False, None
+        return True, output
+    except Exception:
+        return False, None
+def get_type_counts_reward(answer: str, type_counters: Dict[str, Dict[str, int]], hierarchical: bool = False) -> float:
+    """
+    Calculate the type counts ("surprise") reward for an answer.
+    Args:
+        answer: A string containing the answer
+        type_counters: A dictionary mapping type name -> {element string -> count}
+        hierarchical: Whether to use hierarchical type counts
+    Returns:
+        A surprise value in [0, 1]; rarer answers score higher. In hierarchical mode it is the
+        mean of (1 - P(type)) and (1 - P(element | type)); otherwise it is 1 - P(element)
+        over all elements flattened together. Unseen types/elements, or counters whose
+        total is zero, count as maximal surprise (1.0).
+    """
+    if hierarchical:
+        # Distribution over types: total count of all elements recorded for each type
+        type_totals = {type_name: sum(counts.values()) for type_name, counts in type_counters.items()}
+        # SECURITY NOTE(review): `answer` is evaluated with eval(); if it can come from an
+        # untrusted source (e.g. model output) this executes arbitrary code. Left in place
+        # because callers may rely on non-literal expressions being evaluated.
+        try:
+            answer_type = type(eval(answer)).__name__
+        except (Exception, SystemExit):
+            # Anything that cannot be evaluated is treated as a plain string. SystemExit is
+            # included so an answer such as "exit()" cannot terminate the process, while
+            # KeyboardInterrupt is no longer swallowed.
+            answer_type = 'str'
+        grand_total = sum(type_totals.values())
+        if answer_type in type_totals and grand_total > 0:
+            type_surprise = 1 - (type_totals[answer_type] / grand_total)
+        else:
+            type_surprise = 1.0
+        element_counts = type_counters.get(answer_type, {})
+        element_total = sum(element_counts.values())
+        if answer in element_counts and element_total > 0:
+            element_surprise = 1 - (element_counts[answer] / element_total)
+        else:
+            element_surprise = 1.0
+        return (type_surprise + element_surprise) / 2
+    # Flat mode: merge every type's elements into one categorical distribution
+    # (an element listed under several types keeps the count seen last).
+    flattened_type_counters = {}
+    for counts in type_counters.values():
+        flattened_type_counters.update(counts)
+    flat_total = sum(flattened_type_counters.values())
+    if answer in flattened_type_counters and flat_total > 0:
+        return 1 - (flattened_type_counters[answer] / flat_total)
+    return 1.0

absolute_zero_reasoner/rewards/custom_evaluate.py ADDED Viewed

	@@ -0,0 +1,387 @@

+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py
+import re
+from collections import Counter
+from typing import Tuple, List, Dict
+from math_verify import parse, verify
+from absolute_zero_reasoner.rewards.math_utils import grade_answer_mathd, grade_answer_sympy
+def choice_answer_clean(pred: str):
+    """
+    Reduce a multiple-choice prediction to its final standalone letter, if any.
+    Adapted from https://github.com/hkust-nlp/simpleRL-reason/blob/main/eval/grader.py
+    Returns the last single-letter word (upper-cased) found in the prediction, or the
+    trimmed prediction itself when no such letter exists.
+    """
+    trimmed = pred.strip("\n").rstrip(".").rstrip("/").strip(" ").lstrip(":")
+    # Standalone letters A-Z, searched in the upper-cased text
+    letters = re.findall(r"\b(A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z)\b", trimmed.upper())
+    choice = letters[-1] if letters else trimmed.strip().strip(".")
+    # Drop any trailing period/slash that survived
+    return choice.rstrip(".").rstrip("/")
+def extract_code(completion: str, language: str = "python") -> str:
+    """
+    Return the body of the last fenced code block tagged with `language`.
+    Args:
+        completion: Text that may contain ```<language> fenced blocks
+        language: Fence tag to look for; matched literally
+    Returns:
+        The contents of the last matching block, or "" when there is none
+    """
+    # Escape the tag so names containing regex metacharacters (e.g. "c++") match literally
+    pattern = re.compile(rf"```{re.escape(language)}\n(.*?)```", re.DOTALL)
+    matches = pattern.findall(completion)
+    return matches[-1] if matches else ""
+def get_gt_reward(solution_str: str, ground_truth: str, extraction_type: str, metric: str, math_metric: str = 'deepscaler', boxed_retry: bool = False) -> float:
+    """
+    Score an extracted answer against the ground truth.
+    Args:
+        solution_str: The model response
+        ground_truth: The reference answer
+        extraction_type: Passed to `extract_answer` to pull the answer out of the response
+        metric: 'mc' (multiple choice), 'math', or 'code_eval'
+        math_metric: For metric='math': 'math_verify', 'deepscaler', or 'union' of both
+        boxed_retry: Passed to `extract_answer`
+    Returns:
+        1.0 when the answer is judged correct, otherwise 0.0
+    Raises:
+        ValueError: For an unknown `metric` or `math_metric`
+    """
+    answer = extract_answer(solution_str, extraction_type, boxed_retry=boxed_retry)
+    if metric == 'mc':
+        # Exact letter match first, then fall back to the math graders
+        if choice_answer_clean(answer) == ground_truth:
+            return 1.0
+        return 1.0 if (grade_answer_sympy(answer, ground_truth) or grade_answer_mathd(answer, ground_truth)) else 0.0
+    if metric == 'math':
+        if math_metric == 'math_verify':
+            gold = parse('\\boxed{' + ground_truth + '}')
+            answer = parse('\\boxed{' + answer + '}')
+            return 1.0 if verify(gold, answer) else 0.0
+        if math_metric == 'deepscaler':
+            return 1.0 if (grade_answer_sympy(answer, ground_truth) or grade_answer_mathd(answer, ground_truth)) else 0.0
+        if math_metric == 'union':
+            math_verify_gold = parse('\\boxed{' + ground_truth + '}')
+            math_verify_answer = parse('\\boxed{' + answer + '}')
+            is_correct = (
+                grade_answer_sympy(answer, ground_truth)
+                or grade_answer_mathd(answer, ground_truth)
+                or verify(math_verify_gold, math_verify_answer)
+            )
+            return 1.0 if is_correct else 0.0
+        raise ValueError(f"Invalid math metric: {math_metric}")
+    if metric == 'code_eval':
+        # Both sides are evaluated as Python expressions and compared by value.
+        # Local names are deliberately left as-is: eval() sees this function's scope.
+        try:
+            answer = eval(answer.strip())
+        except Exception:
+            return 0.0
+        ground_truth = eval(ground_truth.strip())
+        return 1.0 if answer == ground_truth else 0.0
+    raise ValueError(f"Invalid metric: {metric}")
+def extract_answer(solution_str: str, extraction_type: str, boxed_retry: bool = False) -> str:
+    """
+    Pull the final answer out of a response.
+    Args:
+        solution_str: The model response
+        extraction_type: Starts with 'answer' (use <answer> tags) or 'boxed' (use \\boxed{...})
+        boxed_retry: In 'answer' mode without tags, fall back to the last boxed string,
+            or to the whole response when nothing is boxed
+    Returns:
+        The stripped answer, or '' when nothing could be extracted
+    Raises:
+        ValueError: If `extraction_type` starts with neither 'answer' nor 'boxed'
+    """
+    if extraction_type.startswith('answer'):
+        if "<answer>" in solution_str:
+            # Text after the last opening tag, up to the first closing tag that follows
+            after_tag = solution_str.rpartition("<answer>")[2]
+            return after_tag.partition("</answer>")[0].strip()
+        if not boxed_retry:
+            return ''
+        boxed = last_boxed_only_string(solution_str)
+        return (solution_str if boxed is None else boxed).strip()
+    if extraction_type.startswith('boxed'):
+        boxed = last_boxed_only_string(solution_str)
+        return '' if boxed is None else boxed.strip()
+    raise ValueError(f"Invalid extraction type: {extraction_type}")
+def extract_thought(solution_str: str) -> str:
+    """Return the text inside the last <think>...</think> span, or the whole string when there is no <think> tag."""
+    if "<think>" not in solution_str:
+        return solution_str
+    after_tag = solution_str.rpartition("<think>")[2]
+    return after_tag.partition("</think>")[0]
+def get_format_reward(
+    solution_str: str,
+    extraction_type: str,
+) -> float:
+    """
+    Return 1.0 when the response follows the expected format, else 0.0.
+    'answer' types require the response to begin with <think>...</think> followed by
+    <answer>...</answer>; 'boxed' types require a boxed expression to be present.
+    Raises:
+        ValueError: If `extraction_type` starts with neither 'answer' nor 'boxed'
+    """
+    if extraction_type.startswith('answer'):
+        # re.match anchors at the start of the response
+        well_formed = re.match(r"(?s)<think>.*?</think>\s*<answer>.*?</answer>", solution_str)
+        return 1. if well_formed else 0.
+    if extraction_type.startswith('boxed'):
+        return 0. if last_boxed_only_string(solution_str) is None else 1.
+    raise ValueError(f"Invalid extraction type: {extraction_type}")
+def extract_code_content(solution_str):
+    """
+    Unwrap a leading fenced code block.
+    If the string starts with an ```xml block (case-insensitive), or failing that with any
+    fenced block, return that block's stripped contents; otherwise return the stripped string.
+    """
+    # Tried in order: XML-tagged fence first, then a fence with any (or no) tag
+    leading_fences = (
+        (r'^```\s*xml\n(.*?)```', re.DOTALL | re.IGNORECASE),
+        (r'^```\s*\w*\n(.*?)```', re.DOTALL),
+    )
+    for fence_regex, fence_flags in leading_fences:
+        found = re.match(fence_regex, solution_str, fence_flags)
+        if found:
+            return found.group(1).strip()
+    # No code block at the start: hand back the original text
+    return solution_str.strip()
+def get_reward(
+    solution_str: str,
+    ground_truth: str,
+    extra_info: dict,
+    extraction_type: str,
+    splitter: str,
+    math_metric: str = 'deepscaler',
+    boxed_retry: bool = False,
+) -> Tuple[float, Dict[str, float]]:
+    """
+    Compute the reward for a response plus its ground-truth / format components.
+    Args:
+        solution_str: Full text; the response is the part after the first `splitter`
+        ground_truth: The reference answer
+        extra_info: Must provide 'metric' and 'split' ('train' or 'test')
+        extraction_type: 'answer*' or 'boxed*', with a suffix ('conditional', 'addition',
+            'multiply') selecting how the two components are combined during training
+        splitter: Separator between prompt and response
+        math_metric: Passed to `get_gt_reward`
+        boxed_retry: Passed to `get_gt_reward`
+    Returns:
+        (reward, {'gt': ..., 'format': ...}). On the test split the reward is the
+        ground-truth reward alone.
+    Raises:
+        ValueError: For an unknown split or combination suffix
+    """
+    # Keep only the response, without surrounding whitespace or quote characters
+    solution_str = solution_str.split(splitter)[1].strip().strip('\"\'')
+    gt_reward = get_gt_reward(solution_str, ground_truth, extraction_type, extra_info['metric'], math_metric, boxed_retry=boxed_retry)
+    format_reward = get_format_reward(solution_str, extraction_type)
+    components = {'gt': gt_reward, 'format': format_reward}
+    split_name = extra_info['split']
+    if split_name == 'test':
+        return gt_reward, components
+    if split_name != 'train':
+        raise ValueError(f"Invalid split: {extra_info['split']}")
+    if not extraction_type.startswith(('answer', 'boxed')):
+        # Mirrors the original fall-through for other extraction types
+        return None
+    if extraction_type.endswith('conditional'):
+        # -1 for bad formatting; otherwise 1 for a correct answer and -0.5 for a wrong one
+        if not format_reward:
+            return -1., components
+        return 1. if gt_reward else -0.5, components
+    if extraction_type.endswith('addition'):
+        return (0.5 if format_reward else 0.) + gt_reward, components
+    if extraction_type.endswith('multiply'):
+        return format_reward * gt_reward, components
+    raise ValueError(f"Invalid extraction type: {extraction_type}")
+# string normalization from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
+def is_equiv(str1: str, str2: str, verbose: bool = False) -> bool:
+    """
+    Compare two answer strings after normalization with `strip_string`.
+    Two None values are considered equal (with a printed warning); a single None never
+    matches. If normalization raises, the raw strings are compared instead.
+    """
+    if str1 is None or str2 is None:
+        if str1 is None and str2 is None:
+            print("WARNING: Both None")
+            return True
+        return False
+    try:
+        normalized = strip_string(str1), strip_string(str2)
+        if verbose:
+            print(*normalized)
+        return normalized[0] == normalized[1]
+    except Exception:
+        return str1 == str2
+def remove_boxed(s: str) -> str:
+    """
+    Strip the \\boxed wrapper from a string that consists solely of a boxed expression.
+    Handles both the "\\boxed <value>" and the "\\boxed{<value>}" forms; asserts that the
+    string really has that shape.
+    """
+    spaced_prefix = "\\boxed "
+    if spaced_prefix in s:
+        assert s.startswith(spaced_prefix)
+        return s[len(spaced_prefix):]
+    braced_prefix = "\\boxed{"
+    assert s.startswith(braced_prefix)
+    assert s.endswith("}")
+    return s[len(braced_prefix):-1]
+def last_boxed_only_string(string: str) -> str:
+    """
+    Return the last \\boxed{...} (or \\fbox{...}) expression in the string, wrapper included.
+    The "\\boxed <value>" form is handled first: everything after its last occurrence up to
+    the next "$" is returned. Returns None when nothing boxed is found or the braces never close.
+    """
+    if "\\boxed " in string:
+        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
+    start = string.rfind("\\boxed")
+    if start < 0:
+        start = string.rfind("\\fbox")
+    if start < 0:
+        return None
+    # Scan forward, tracking brace depth, until the opening brace is balanced
+    depth = 0
+    for position in range(start, len(string)):
+        char = string[position]
+        if char == "{":
+            depth += 1
+        elif char == "}":
+            depth -= 1
+            if depth == 0:
+                return string[start:position + 1]
+    return None
+def fix_fracs(string: str) -> str:
+    """
+    Add the missing braces to shorthand fractions, e.g. \\frac12 -> \\frac{1}{2}.
+    Args:
+        string: LaTeX text that may contain \\frac commands
+    Returns:
+        The text with every \\frac followed by braced arguments. \\frac1{72} becomes
+        \\frac{1}{72}. If a \\frac is followed by fewer than two characters (including
+        nothing at all) the input is returned unchanged.
+    """
+    pieces = string.split("\\frac")
+    rebuilt = pieces[0]
+    for piece in pieces[1:]:
+        rebuilt += "\\frac"
+        if piece.startswith("{"):
+            # Already braced: keep as-is
+            rebuilt += piece
+            continue
+        # Explicit length check instead of `assert` (asserts vanish under -O) and
+        # instead of indexing an empty piece, which raised IndexError for a trailing \frac
+        if len(piece) < 2:
+            return string
+        numerator, denominator, rest = piece[0], piece[1], piece[2:]
+        if denominator != "{":
+            rebuilt += "{" + numerator + "}{" + denominator + "}" + rest
+        else:
+            # Denominator is already braced; only the numerator needs wrapping
+            rebuilt += "{" + numerator + "}" + denominator + rest
+    return rebuilt
+def fix_a_slash_b(string: str) -> str:
+    """
+    Rewrite a plain integer ratio "a/b" as "\\frac{a}{b}".
+    Args:
+        string: Candidate text
+    Returns:
+        "\\frac{a}{b}" when the whole string is exactly two canonical integers separated
+        by one slash; otherwise the input unchanged
+    """
+    parts = string.split("/")
+    if len(parts) != 2:
+        return string
+    try:
+        numerator = int(parts[0])
+        denominator = int(parts[1])
+    except ValueError:
+        # Non-integer parts (e.g. "a/b", "1.5/2") previously escaped as an uncaught ValueError
+        return string
+    # Reject non-canonical spellings such as " 1/2" or "01/2"
+    if string != "{}/{}".format(numerator, denominator):
+        return string
+    return "\\frac{" + str(numerator) + "}{" + str(denominator) + "}"
+def remove_right_units(string: str) -> str:
+    """
+    Drop a trailing unit description introduced by "\\text{ ".
+    Returns everything before the marker; asserts the marker occurs exactly once.
+    Strings without the marker are returned unchanged.
+    """
+    # "\\text{ " only ever occurs (at least in the val set) when describing units
+    unit_marker = "\\text{ "
+    if unit_marker not in string:
+        return string
+    value_part, *remainder = string.split(unit_marker)
+    assert len(remainder) == 1
+    return value_part
+def fix_sqrt(string: str) -> str:
+    """
+    Brace the argument of shorthand square roots, e.g. \\sqrt3 -> \\sqrt{3}.
+    Args:
+        string: LaTeX text that may contain \\sqrt commands
+    Returns:
+        The text with the single character after each unbraced \\sqrt wrapped in braces.
+        A \\sqrt with nothing after it is left untouched.
+    """
+    if "\\sqrt" not in string:
+        return string
+    head, *tails = string.split("\\sqrt")
+    rebuilt = head
+    for tail in tails:
+        # `tail` is empty when \sqrt ends the text or is directly followed by another
+        # \sqrt; indexing tail[0] in that case used to raise IndexError
+        if tail and not tail.startswith("{"):
+            rebuilt += "\\sqrt{" + tail[0] + "}" + tail[1:]
+        else:
+            rebuilt += "\\sqrt" + tail
+    return rebuilt
+def strip_string(string: str) -> str:
+    # linebreaks
+    string = string.replace("\n", "")
+    # remove inverse spaces
+    string = string.replace("\\!", "")
+    # replace \\ with \
+    string = string.replace("\\\\", "\\")
+    # replace tfrac and dfrac with frac
+    string = string.replace("tfrac", "frac")
+    string = string.replace("dfrac", "frac")
+    # remove \left and \right
+    string = string.replace("\\left", "")
+    string = string.replace("\\right", "")
+    # Remove circ (degrees)
+    string = string.replace("^{\\circ}", "")
+    string = string.replace("^\\circ", "")
+    # remove dollar signs
+    string = string.replace("\\$", "")
+    # remove units (on the right)
+    string = remove_right_units(string)
+    # remove percentage
+    string = string.replace("\\%", "")
+    string = string.replace("\%", "")  # noqa: W605
+    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+    string = string.replace(" .", " 0.")
+    string = string.replace("{.", "{0.")
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == ".":
+        string = "0" + string
+    # to consider: get rid of e.g. "k = " or "q = " at beginning
+    if len(string.split("=")) == 2:
+        if len(string.split("=")[0]) <= 2:
+            string = string.split("=")[1]
+    # fix sqrt3 --> sqrt{3}
+    string = fix_sqrt(string)
+    # remove spaces
+    string = string.replace(" ", "")
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
+    string = fix_fracs(string)
+    # manually change 0.5 --> \frac{1}{2}
+    if string == "0.5":
+        string = "\\frac{1}{2}"
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = fix_a_slash_b(string)
+    return string

absolute_zero_reasoner/rewards/math_utils.py ADDED Viewed

	@@ -0,0 +1,490 @@

+"""
+https://github.com/agentica-project/deepscaler/blob/main/deepscaler/rewards/math_utils/utils.py
+"""
+import re
+from pylatexenc import latex2text
+import sympy
+from sympy.parsing import sympy_parser
+from typing import Optional
+# Dan Hendrycks' code
+def mathd_normalize_answer(answer: Optional[str]) -> Optional[str]:
+    if answer is None:
+        return None
+    answer = answer.strip()
+    try:
+        # Remove enclosing `\text{}`.
+        m = re.search("^\\\\text\{(?P<text>.+?)\}$", answer)
+        if m is not None:
+            answer = m.group("text").strip()
+        return _strip_string(answer)
+    except:
+        return answer
+def _strip_string(string: str) -> str:
+    """Normalize a LaTeX answer string for exact-match comparison.
+
+    Applies a fixed sequence of textual rewrites (see the inline comments) and
+    returns the result.  The nested helpers can raise (IndexError when a
+    ``\\frac``/``\\sqrt`` ends the string, AssertionError on repeated unit
+    markers); ``mathd_normalize_answer`` wraps this call in a try/except.
+    """
+    def _fix_fracs(string):
+        """Brace brace-less ``\\frac`` arguments, e.g. ``\\frac12`` -> ``\\frac{1}{2}``."""
+        substrs = string.split("\\frac")
+        new_str = substrs[0]
+        if len(substrs) > 1:
+            substrs = substrs[1:]
+            for substr in substrs:
+                new_str += "\\frac"
+                if substr[0] == "{":
+                    new_str += substr
+                else:
+                    # Fewer than two characters after \frac: give up and
+                    # return the input untouched.
+                    try:
+                        assert len(substr) >= 2
+                    except:
+                        return string
+                    a = substr[0]
+                    b = substr[1]
+                    if b != "{":
+                        if len(substr) > 2:
+                            post_substr = substr[2:]
+                            new_str += "{" + a + "}{" + b + "}" + post_substr
+                        else:
+                            new_str += "{" + a + "}{" + b + "}"
+                    else:
+                        if len(substr) > 2:
+                            post_substr = substr[2:]
+                            new_str += "{" + a + "}" + b + post_substr
+                        else:
+                            new_str += "{" + a + "}" + b
+        string = new_str
+        return string
+    def _fix_a_slash_b(string):
+        """Turn a canonical integer ratio ``a/b`` into ``\\frac{a}{b}``; anything else is returned as is."""
+        if len(string.split("/")) != 2:
+            return string
+        a = string.split("/")[0]
+        b = string.split("/")[1]
+        try:
+            a = int(a)
+            b = int(b)
+            # Only rewrite when the text round-trips exactly.
+            assert string == "{}/{}".format(a, b)
+            new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
+            return new_string
+        except:
+            return string
+    def _remove_right_units(string):
+        """Return the text before a single ``\\text{ `` marker; asserts there is at most one."""
+        # "\\text{ " only ever occurs (at least in the val set) when describing units
+        if "\\text{ " in string:
+            splits = string.split("\\text{ ")
+            assert len(splits) == 2
+            return splits[0]
+        else:
+            return string
+    def _fix_sqrt(string):
+        """Brace the single character after a brace-less ``\\sqrt``, e.g. ``\\sqrt3`` -> ``\\sqrt{3}``."""
+        if "\\sqrt" not in string:
+            return string
+        splits = string.split("\\sqrt")
+        new_string = splits[0]
+        for split in splits[1:]:
+            if split[0] != "{":
+                a = split[0]
+                new_substr = "\\sqrt{" + a + "}" + split[1:]
+            else:
+                new_substr = "\\sqrt" + split
+            new_string += new_substr
+        return new_string
+    # linebreaks
+    string = string.replace("\n", "")
+    # remove inverse spaces
+    string = string.replace("\\!", "")
+    # replace \\ with \
+    string = string.replace("\\\\", "\\")
+    # replace tfrac and dfrac with frac
+    string = string.replace("tfrac", "frac")
+    string = string.replace("dfrac", "frac")
+    # remove \left and \right
+    string = string.replace("\\left", "")
+    string = string.replace("\\right", "")
+    # Remove circ (degrees)
+    string = string.replace("^{\\circ}", "")
+    string = string.replace("^\\circ", "")
+    # remove dollar signs
+    string = string.replace("\\$", "")
+    # remove units (on the right)
+    string = _remove_right_units(string)
+    # remove percentage
+    string = string.replace("\\%", "")
+    # NOTE(review): "\%" is an invalid escape that spells the same two
+    # characters as "\\%", so this second call appears to be a no-op.
+    string = string.replace("\%", "")
+    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+    string = string.replace(" .", " 0.")
+    string = string.replace("{.", "{0.")
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == ".":
+        string = "0" + string
+    # to consider: get rid of e.g. "k = " or "q = " at beginning
+    if len(string.split("=")) == 2:
+        if len(string.split("=")[0]) <= 2:
+            string = string.split("=")[1]
+    # fix sqrt3 --> sqrt{3}
+    string = _fix_sqrt(string)
+    # remove spaces
+    string = string.replace(" ", "")
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
+    string = _fix_fracs(string)
+    # manually change 0.5 --> \frac{1}{2}
+    if string == "0.5":
+        string = "\\frac{1}{2}"
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = _fix_a_slash_b(string)
+    return string
+# sympy might hang -- we don't care about trying to be lenient in these cases
+BAD_SUBSTRINGS = ["^{", "^("]
+BAD_REGEXES = ["\^[0-9]+\^", "\^[0-9][0-9]+"]
+TUPLE_CHARS = "()[]"
+def _sympy_parse(expr: str):
+    """Parses an expression with sympy."""
+    py_expr = expr.replace("^", "**")
+    return sympy_parser.parse_expr(
+        py_expr,
+        transformations=(
+            sympy_parser.standard_transformations
+            + (sympy_parser.implicit_multiplication_application,)
+        ),
+    )
+def _parse_latex(expr: str) -> str:
+    """Attempts to parse latex to an expression sympy can read."""
+    expr = expr.replace("\\tfrac", "\\frac")
+    expr = expr.replace("\\dfrac", "\\frac")
+    expr = expr.replace("\\frac", " \\frac")  # Play nice with mixed numbers.
+    expr = latex2text.LatexNodes2Text().latex_to_text(expr)
+    # Replace the specific characters that this parser uses.
+    expr = expr.replace("√", "sqrt")
+    expr = expr.replace("π", "pi")
+    expr = expr.replace("∞", "inf")
+    expr = expr.replace("∪", "U")
+    expr = expr.replace("·", "*")
+    expr = expr.replace("×", "*")
+    return expr.strip()
+def _is_float(num: str) -> bool:
+    try:
+        float(num)
+        return True
+    except ValueError:
+        return False
+def _is_int(x: float) -> bool:
+    try:
+        return abs(x - int(round(x))) <= 1e-7
+    except:
+        return False
+def _is_frac(expr: str) -> bool:
+    return bool(re.search(r"^-?[0-9]+.?/0*[1-9][0-9]*.?$", expr))
+def _str_is_int(x: str) -> bool:
+    try:
+        x = _strip_properly_formatted_commas(x)
+        x = float(x)
+        return abs(x - int(round(x))) <= 1e-7
+    except:
+        return False
+def _str_to_int(x: str) -> bool:
+    x = x.replace(",", "")
+    x = float(x)
+    return int(x)
+def _inject_implicit_mixed_number(step: str):
+    """
+    Automatically make a mixed number evalable
+    e.g. 7 3/4 => 7+3/4
+    """
+    p1 = re.compile("([0-9]) +([0-9])")
+    step = p1.sub("\\1+\\2", step)  ## implicit mults
+    return step
+def _strip_properly_formatted_commas(expr: str):
+    # We want to be careful because we don't want to strip tuple commas
+    p1 = re.compile("(\d)(,)(\d\d\d)($|\D)")
+    while True:
+        next_expr = p1.sub("\\1\\3\\4", expr)
+        if next_expr == expr:
+            break
+        expr = next_expr
+    return next_expr
+def _normalize(expr: str) -> str:
+    """Normalize answer expressions."""
+    if expr is None:
+        return None
+    # Remove enclosing `\text{}`.
+    m = re.search("^\\\\text\{(?P<text>.+?)\}$", expr)
+    if m is not None:
+        expr = m.group("text")
+    expr = expr.replace("\\%", "%")
+    expr = expr.replace("\\$", "$")
+    expr = expr.replace("$", "")
+    expr = expr.replace("%", "")
+    expr = expr.replace(" or ", " , ")
+    expr = expr.replace(" and ", " , ")
+    expr = expr.replace("million", "*10^6")
+    expr = expr.replace("billion", "*10^9")
+    expr = expr.replace("trillion", "*10^12")
+    for unit in [
+        "degree",
+        "cm",
+        "centimeter",
+        "meter",
+        "mile",
+        "second",
+        "minute",
+        "hour",
+        "day",
+        "week",
+        "month",
+        "year",
+        "foot",
+        "feet",
+        "inch",
+        "yard",
+    ]:
+        expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
+    expr = re.sub(f"\^ *\\\\circ", "", expr)
+    if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
+        expr = expr[1:-1]
+    expr = re.sub(",\\\\! *", "", expr)
+    if _is_float(expr) and _is_int(float(expr)):
+        expr = str(int(round(float(expr))))
+    if "\\" in expr:
+        try:
+            expr = _parse_latex(expr)
+        except:
+            pass
+    # edge case with mixed numbers and negative signs
+    expr = re.sub("- *", "-", expr)
+    expr = _inject_implicit_mixed_number(expr)
+    expr = expr.replace(" ", "")
+    # if we somehow still have latex braces here, just drop them
+    expr = expr.replace("{", "")
+    expr = expr.replace("}", "")
+    # don't be case sensitive for text answers
+    expr = expr.lower()
+    if _str_is_int(expr):
+        expr = str(_str_to_int(expr))
+    return expr
+def count_unknown_letters_in_expr(expr: str):
+    expr = expr.replace("sqrt", "")
+    expr = expr.replace("frac", "")
+    letters_in_expr = set([x for x in expr if x.isalpha()])
+    return len(letters_in_expr)
+def should_allow_eval(expr: str):
+    # we don't want to try parsing unknown text or functions of more than two variables
+    if count_unknown_letters_in_expr(expr) > 2:
+        return False
+    for bad_string in BAD_SUBSTRINGS:
+        if bad_string in expr:
+            return False
+    for bad_regex in BAD_REGEXES:
+        if re.search(bad_regex, expr) is not None:
+            return False
+    return True
+def are_equal_under_sympy(ground_truth_normalized: str, given_normalized: str):
+    are_equal = False
+    try:
+        expr = f"({ground_truth_normalized})-({given_normalized})"
+        if should_allow_eval(expr):
+            sympy_diff = _sympy_parse(expr)
+            simplified = sympy.simplify(sympy_diff)
+            if simplified == 0:
+                are_equal = True
+    except:
+        pass
+    return are_equal
+def split_tuple(expr: str):
+    """
+    Split the elements in a tuple/interval, while handling well-formatted commas in large numbers
+    """
+    expr = _strip_properly_formatted_commas(expr)
+    if len(expr) == 0:
+        return []
+    if (
+        len(expr) > 2
+        and expr[0] in TUPLE_CHARS
+        and expr[-1] in TUPLE_CHARS
+        and all([ch not in expr[1:-1] for ch in TUPLE_CHARS])
+    ):
+        elems = [elem.strip() for elem in expr[1:-1].split(",")]
+    else:
+        elems = [expr]
+    return elems
+def last_boxed_only_string(string):
+    idx = string.rfind("\\boxed")
+    if idx < 0:
+        idx = string.rfind("\\fbox")
+        if idx < 0:
+            return None
+    i = idx
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == "{":
+            num_left_braces_open += 1
+        if string[i] == "}":
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+        i += 1
+    if right_brace_idx == None:
+        retval = None
+    else:
+        retval = string[idx:right_brace_idx + 1]
+    return retval
+def remove_boxed(s):
+    left = "\\boxed{"
+    try:
+        assert s[:len(left)] == left
+        assert s[-1] == "}"
+        return s[len(left):-1]
+    except:
+        return None
+def extract_boxed_answer(solution: str) -> str:
+    """Extract the answer from inside a LaTeX \\boxed{} command"""
+    solution = last_boxed_only_string(solution)
+    solution = remove_boxed(solution)
+    return solution
+def grade_answer_sympy(given_answer: str, ground_truth: str) -> bool:
+    ground_truth_normalized = _normalize(ground_truth)
+    given_normalized = _normalize(given_answer)
+    if ground_truth_normalized is None:
+        return False
+    if ground_truth_normalized == given_normalized:
+        return True
+    if len(given_normalized) == 0:
+        return False
+    ground_truth_elems = split_tuple(ground_truth_normalized)
+    given_elems = split_tuple(given_normalized)
+    if len(ground_truth_elems) > 1 and (
+        ground_truth_normalized[0] != given_normalized[0]
+        or ground_truth_normalized[-1] != given_normalized[-1]
+    ):
+        is_correct = False
+    elif len(ground_truth_elems) != len(given_elems):
+        is_correct = False
+    else:
+        for ground_truth_elem, given_elem in zip(ground_truth_elems, given_elems):
+            if _is_frac(ground_truth_elem) and _is_frac(given_elem):
+                # if fractions aren't reduced, then shouldn't be marked as correct
+                # so, we don't want to allow sympy.simplify in this case
+                is_correct = ground_truth_elem == given_elem
+            elif _str_is_int(ground_truth_elem) != _str_is_int(given_elem):
+                # if the ground truth answer is an integer, we require the given answer to be a strict match (no sympy.simplify)
+                is_correct = False
+            else:
+                is_correct = are_equal_under_sympy(ground_truth_elem, given_elem)
+            if not is_correct:
+                break
+    return is_correct
+def grade_answer_mathd(given_answer: str, ground_truth: str) -> bool:
+    ground_truth_normalized_mathd = mathd_normalize_answer(ground_truth)
+    given_answer_normalized_mathd = mathd_normalize_answer(given_answer)
+    # be at least as lenient as mathd
+    if ground_truth_normalized_mathd == given_answer_normalized_mathd:
+        return True
+    return False
+def extract_answer(passage: str) -> str:
+    if "\\boxed" in passage:
+        return extract_boxed_answer(passage)
+    return None
+def grade_answer_verl(solution_str, ground_truth):
+    if not ground_truth:
+        return False
+    if '\\boxed' in ground_truth:
+        ground_truth = extract_answer(ground_truth)
+    given_answer = extract_answer(solution_str)
+    if given_answer is None:
+        return False
+    return grade_answer_mathd(given_answer, ground_truth) \
+        or grade_answer_sympy(given_answer, ground_truth)

absolute_zero_reasoner/rewards/reward_managers.py ADDED Viewed

	@@ -0,0 +1,898 @@

+import os
+from functools import partial
+from typing import Dict, Any, List, Tuple
+from collections import defaultdict
+import re
+import uuid
+from functools import partial
+import numpy as np
+import pandas as pd
+import torch
+from transformers import AutoTokenizer
+from verl import DataProto
+from verl.protocol import DataProtoItem
+from verl.utils.dataset.rl_dataset import collate_fn
+from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
+import absolute_zero_reasoner.rewards.custom_evaluate as custom_evaluate
+from absolute_zero_reasoner.rewards.code_reward import (
+    parse_code_input_output,
+    parse_inputs_message,
+    parse_code_function,
+    ast_edit_distance,
+    get_code_complexity_reward,
+    get_halstead_reward,
+    get_type_counts_reward,
+)
+from absolute_zero_reasoner.rewards.custom_evaluate import get_format_reward, extract_answer, extract_thought
+from absolute_zero_reasoner.data_construction.process_data import boxed_instruction, instruction_following
+from absolute_zero_reasoner.data_construction.constructor import get_code_problem_predictor_prompt
+from absolute_zero_reasoner.utils.dataset.rl_dataset import RLHFDataset
+from absolute_zero_reasoner.utils.logging_utils.stdout import PrettyPrinter
+from absolute_zero_reasoner.utils.code_utils.checks import check_composite_function, check_no_definitions
+class CodeIORewardManager():
+    """The reward manager."""
+    def __init__(
+        self,
+        tokenizer: AutoTokenizer,
+        num_examine: int,
+        split: str,
+        reward_fn_extraction_type: str,
+        math_metric: str,
+        splitter: str,
+        output_path: str,
+        generation_reward_config: Dict[str, Any],
+        debug: bool = False,
+        max_prompt_length: int = 8192,
+        valid_program_filter: str = 'all',
+        batched_estimate: bool = False,
+        extract_code_block: bool = True,
+        num_inputs: int = 10,
+        code_f_reward_type: str = 'accuracy',
+        boxed_retry: bool = False,
+    ):
+        self.tokenizer = tokenizer
+        self.num_examine = num_examine  # the number of batches of decoded responses to print to the console
+        self.compute_score = partial(custom_evaluate.get_reward, math_metric=math_metric, boxed_retry=boxed_retry)
+        self.reward_fn_extraction_type = reward_fn_extraction_type
+        self.split = split
+        self.splitter = splitter
+        self.output_path = output_path
+        self.max_prompt_length = max_prompt_length
+        self.generation_reward_config = generation_reward_config
+        self.valid_program_filter = valid_program_filter
+        self.batched_estimate = batched_estimate
+        self.debug = debug
+        self.extract_code_block = extract_code_block
+        self.use_original_code_as_ref = generation_reward_config.use_original_code_as_ref
+        self.num_inputs = num_inputs
+        self.code_f_reward_type = code_f_reward_type
+        self.boxed_retry = boxed_retry
+    @staticmethod
+    def extract_input_output(extracted_content: str, return_input: bool = True, return_output: bool = False) -> Tuple[str, str]:
+        input_pattern = r"```input\s*\n?(.*?)\n?```"
+        output_pattern = r"```output\s*\n?(.*?)\n?```"
+        assert not (return_input and return_output), "Cannot return both input and output"
+        assert return_input or return_output, "Must return at least one of input or output"
+        # Use flags for case-insensitive matching and dotall
+        flags = re.DOTALL | re.IGNORECASE
+        if return_input:
+            input_matches = list(re.finditer(input_pattern, extracted_content, flags))
+            if not input_matches:
+                # Try alternative pattern without explicit input block
+                input_matches = list(re.finditer(r"# Input:\s*(.*?)(?=\n```|$)", extracted_content, flags))
+            if not input_matches:
+                # Match input() function call and preserve quotes
+                input_matches = list(re.finditer(r'input\s*\((.*?)\)', extracted_content, flags))
+            if not input_matches:
+                # Match <input> tag with optional closing tag, strip spaces
+                input_matches = list(re.finditer(r"<input>\s*(.*?)(?:</input>|\s*$)", extracted_content, flags))
+            if not input_matches:
+                # Match "The input is" pattern case-insensitively
+                input_matches = list(re.finditer(r"the input is\s*(.*?)\.?$", extracted_content, flags))
+            # if still no input matches, use the extracted answer as the input
+            # Don't strip() here to preserve quotes
+            input_snippet = input_matches[-1].group(1) if input_matches else extracted_content
+            return input_snippet
+        if return_output:
+            output_matches = list(re.finditer(output_pattern, extracted_content, flags))
+            if not output_matches:
+                # Try alternative pattern without explicit output block
+                output_matches = list(re.finditer(r"# Output:\s*(.*?)(?=\n```|$)", extracted_content, flags))
+            if not output_matches:
+                # Match output() function call and preserve quotes
+                output_matches = list(re.finditer(r'output\s*\((.*?)\)', extracted_content, flags))
+            if not output_matches:
+                # Match <output> tag with optional closing tag, strip spaces
+                output_matches = list(re.finditer(r"<output>\s*(.*?)(?:</output>|\s*$)", extracted_content, flags))
+            if not output_matches:
+                # Match "The output is" pattern case-insensitively, strip space after "is" and period at end
+                output_matches = list(re.finditer(r"the output is\s*(.*?)\.?$", extracted_content, flags))
+            # if still no output matches, use the extracted answer as the output
+            output_snippet = output_matches[-1].group(1) if output_matches else extracted_content
+            return output_snippet
+    def _get_data_dict(self, data_item: DataProtoItem, problem_type: str, executor, banned_words: List[str], uid: str, banned_assertion_keywords: List[str]) -> Dict:
+        prompt_ids = data_item.batch['prompts']
+        prompt_length = prompt_ids.shape[-1]
+        valid_prompt_length = data_item.batch['attention_mask'][:prompt_length].sum()
+        valid_prompt_ids = prompt_ids[-valid_prompt_length:]
+        response_ids = data_item.batch['responses']
+        valid_response_length = data_item.batch['attention_mask'][prompt_length:].sum()
+        valid_response_ids = response_ids[:valid_response_length]
+        # decode
+        sequences = torch.cat((valid_prompt_ids, valid_response_ids))
+        sequences_str = self.tokenizer.decode(sequences)
+        ground_truth = data_item.non_tensor_batch['reward_model']['ground_truth']
+        data_source = data_item.non_tensor_batch['data_source']
+        extra_info = data_item.non_tensor_batch['extra_info']
+        non_special_tokens_sequences_str = self.tokenizer.decode(self.tokenizer.encode(sequences_str), skip_special_tokens=True)
+        generation = non_special_tokens_sequences_str.split(self.splitter)[1].strip().strip('\"\'')
+        extracted_content = extract_answer(generation, self.reward_fn_extraction_type, boxed_retry=self.boxed_retry)
+        thought = extract_thought(generation)
+        data_dict = {
+            'generation': generation,
+            'data_source': data_source,
+            'ground_truth': ground_truth,
+            'extra_info': extra_info,
+            'non_special_tokens_sequences_str': non_special_tokens_sequences_str,
+            'valid_response_length': valid_response_length,
+            'extracted_content': extracted_content,
+            'thought': thought,
+            'uid': uid,
+        }
+        if problem_type.startswith('gen'):
+            data_dict['references'] = [ref['snippet'] for ref in data_item.non_tensor_batch['extra_info']['chosen_references']]
+            if problem_type != 'gen_code_f':
+                data_dict['composite_functions'] = data_item.non_tensor_batch['extra_info']['composite_functions'].tolist()
+            else:
+                data_dict['imports'] = [ref['imports'] for ref in data_item.non_tensor_batch['extra_info']['chosen_references']]
+            if self.use_original_code_as_ref:
+                data_dict['original_references'] = [ref['original_snippet'] for ref in data_item.non_tensor_batch['extra_info']['chosen_references']]
+        elif problem_type.startswith('pred') and 'code_f' not in problem_type:
+            data_dict['program'] = data_item.non_tensor_batch['problem']
+            data_dict['input'] = data_item.non_tensor_batch['extra_info']['input']
+            data_dict['output'] = data_item.non_tensor_batch['extra_info']['output']
+            data_dict['imports'] = data_item.non_tensor_batch['extra_info'].get('imports', [])
+        elif problem_type.startswith('pred') and 'code_f' in problem_type:
+            data_dict['program'] = data_item.non_tensor_batch['problem']
+            data_dict['given_inputs'] = data_item.non_tensor_batch['extra_info']['given_inputs']
+            data_dict['given_outputs'] = data_item.non_tensor_batch['extra_info']['given_outputs']
+            data_dict['hidden_inputs'] = data_item.non_tensor_batch['extra_info']['hidden_inputs']
+            data_dict['hidden_outputs'] = data_item.non_tensor_batch['extra_info']['hidden_outputs']
+            data_dict['message'] = data_item.non_tensor_batch['extra_info']['message']
+            data_dict['imports'] = data_item.non_tensor_batch['extra_info'].get('imports', [])
+        # if QA task, we only need to check the format
+        if problem_type is None:
+            format_score = get_format_reward(solution_str=generation, extraction_type=self.reward_fn_extraction_type) if self.generation_reward_config.format_reward else 1.
+            data_dict['format_score'] = format_score
+            return data_dict
+        # first go through, we only checking the format
+        elif problem_type.startswith('gen') and 'code_f' not in problem_type:
+            success, result = parse_code_input_output(
+                extracted_content,
+                parse_output=False,
+                remove_after_return=self.generation_reward_config.remove_after_return and self.split == 'train',
+                remove_comments=self.generation_reward_config.remove_comments and self.split == 'train',
+                remove_print=self.generation_reward_config.remove_print and self.split == 'train',
+                reject_multiple_functions=self.generation_reward_config.reject_multiple_functions,
+                f_replace_location=self.generation_reward_config.f_replace_location,
+                reject_test_input_in_code=self.generation_reward_config.reject_test_input_in_code,
+                code_location=self.generation_reward_config.code_location,
+            )
+            if len(data_dict['composite_functions']) > 0 and success:
+                # first, check if the composite function names are redefined in the code, which we do not allow
+                success = check_no_definitions(result['code'], [f'g_{i}' for i in range(len(data_dict['composite_functions']))])
+                if not success: # if the composite function names are redefined, we do not allow the code
+                    data_dict['code_validity'] = False
+                    data_dict['format_score'] = 0.
+                    return data_dict
+                composite_imports = '\n'.join(
+                    '\n'.join(list(d['imports'])) if list(d['imports']) else '' for d in data_dict['composite_functions']
+                ).strip()
+                composite_snippets = '\n\n'.join(d['snippet'] for d in data_dict['composite_functions']).strip()
+                # cache the original code
+                result['original_code'] = result['code']
+                result['code'] = f"{composite_imports}\n\n{composite_snippets}\n\n{result['code']}".strip()
+                # TODO: composite function check
+                success = check_composite_function(
+                    code = result['code'],
+                    composite_functions = [d['snippet'] for d in data_dict['composite_functions']],
+                )
+            if success:
+                code_validity, output = executor.check_all(
+                    code=result['code'],
+                    inputs=result['input'],
+                    banned_keywords=banned_words,
+                    check_determinism=True,
+                    imports=list(set(result['imports'])),
+                    check_error=problem_type == 'gen_code_e',
+                    banned_keywords_for_errors_and_exceptions=banned_assertion_keywords,
+                )
+                if not code_validity:
+                    data_dict['code_validity'] = False
+                    data_dict['format_score'] = 0.
+                    return data_dict
+                # means the code is valid, we append any good programs, but we eval format separately
+                data_dict['answer'] = {
+                    'snippet': result['code'],
+                    'original_snippet': result['original_code'] if 'original_code' in result else result['code'],
+                    'input': result['input'],
+                    'output': output,
+                    'imports': result['imports'],
+                    'thought': thought,
+                    'composite_functions': data_dict['composite_functions']
+                }
+                format_score = get_format_reward(solution_str=generation, extraction_type=self.reward_fn_extraction_type) if self.generation_reward_config.format_reward else 1.
+                data_dict['format_score'] = format_score
+                data_dict['code_validity'] = True
+                return data_dict
+            else:
+                data_dict['code_validity'] = False
+                data_dict['format_score'] = 0.
+                return data_dict
+        elif problem_type == 'gen_code_f':
+            success, result = parse_inputs_message(
+                extracted_content,
+                num_inputs=self.num_inputs,
+            )
+            if success and len(result['inputs']) == self.num_inputs: # for code_f, we need to ensure the number of inputs is correct
+                outputs = []
+                for inpt in result['inputs']:
+                    code_validity, output = executor.check_all(
+                        code=data_dict['references'][0],
+                        inputs=inpt,
+                        banned_keywords=[],
+                        check_determinism=True,
+                        imports=data_dict['imports'][0],
+                        check_error=False,
+                        banned_keywords_for_errors_and_exceptions=[],
+                    )
+                    if not code_validity:
+                        data_dict['code_validity'] = False
+                        data_dict['format_score'] = 0.
+                        return data_dict
+                    outputs.append(output)
+                data_dict['answer'] = {
+                    'snippet': data_dict['references'][0],
+                    'inputs': result['inputs'],
+                    'outputs': outputs,
+                    'message': result['message'],
+                    'imports': data_dict['imports'][0],
+                    'thought': thought,
+                }
+                format_score = get_format_reward(solution_str=generation, extraction_type=self.reward_fn_extraction_type) if self.generation_reward_config.format_reward else 1.
+                data_dict['format_score'] = format_score
+                data_dict['code_validity'] = True
+                return data_dict
+            else:
+                data_dict['code_validity'] = False
+                data_dict['format_score'] = 0.
+                return data_dict
+        # if prediction is the task
+        elif problem_type.startswith('pred'):
+            # Check required blocks
+            if problem_type.endswith('code_i'): # parse input
+                input_snippet = self.extract_input_output(extracted_content, return_input=True, return_output=False) \
+                    if self.extract_code_block else extracted_content
+                if input_snippet is None:
+                    data_dict['format_score'] = 0.
+                    return data_dict
+                format_score = get_format_reward(solution_str=generation, extraction_type=self.reward_fn_extraction_type) if self.generation_reward_config.format_reward else 1.
+                data_dict['format_score'] = format_score
+                data_dict['answer'] = input_snippet
+                return data_dict
+            elif problem_type.endswith('code_o') or problem_type.endswith('code_e'): #  parse output, code_e format is same as code_o
+                output_snippet = self.extract_input_output(extracted_content, return_input=False, return_output=True) \
+                    if self.extract_code_block else extracted_content
+                if output_snippet is None:
+                    data_dict['format_score'] = 0.
+                    return data_dict
+                format_score = get_format_reward(solution_str=generation, extraction_type=self.reward_fn_extraction_type) if self.generation_reward_config.format_reward else 1.
+                data_dict['format_score'] = format_score
+                data_dict['answer'] = output_snippet
+                return data_dict
+            elif problem_type.endswith('code_f'):
+                success, code_snippet = parse_code_function(extracted_content)
+                if not success:
+                    data_dict['format_score'] = 0.
+                    return data_dict
+                format_score = get_format_reward(solution_str=generation, extraction_type=self.reward_fn_extraction_type) if self.generation_reward_config.format_reward else 1.
+                data_dict['format_score'] = format_score
+                data_dict['answer'] = {
+                    'snippet': code_snippet,
+                    'given_inputs': data_dict['given_inputs'],
+                    'given_outputs': data_dict['given_outputs'],
+                    'hidden_inputs': data_dict['hidden_inputs'],
+                    'hidden_outputs': data_dict['hidden_outputs'],
+                    'message': data_dict['message'],
+                    'imports': data_dict['imports'],
+                    'thought': thought,
+                    'gold_program': data_dict['program'],
+                }
+                return data_dict
+            else:
+                raise ValueError(f"Invalid problem type: {problem_type}")
+        else:
+            raise ValueError(f"Invalid problem type: {problem_type}")
+    def __call__(
+        self,
+        data: DataProto,
+        problem_type: str = None,
+        executor = None,
+        rollout_actor_wg = None,
+        banned_words: List[str] = [],
+        banned_assertion_keywords: List[str] = [],
+        n_samples: int = 1,
+        input_type_counters: Dict[str, Dict[str, int]] = None,
+        output_type_counters: Dict[str, Dict[str, int]] = None,
+        error_type_counters: Dict[str, Dict[str, int]] = None,
+    ) -> Tuple[torch.Tensor, Dict, List[Dict], List[Dict]]:
+        """We will expand this function gradually based on the available datasets"""
+        # If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn
+        if 'rm_scores' in data.batch.keys():
+            return data.batch['rm_scores']
+        reward_tensor = torch.zeros_like(data.batch['responses'], dtype=torch.float32)
+        all_scores = defaultdict(list)
+        data_dicts = []
+        valid_programs = [] # for gen tasks, we need to store the valid programs for later use, ignore this if prediction task
+        invalid_programs = [] # for gen tasks, we need to store the invalid programs for analysis
+        correct_predictions = []
+        uids = np.array([str(uuid.uuid4()) for _ in range(len(data))], dtype=object)
+        if problem_type is None:
+            problem_types = [d.non_tensor_batch['extra_info']['metric'] for d in data]
+            problem_type = 'pred' # dummy set
+        else:
+            problem_types = [problem_type] * len(data)
+        PrettyPrinter.section_header("Getting Data Dicts")
+        for i in range(len(data)): # get format score
+            data_dict = self._get_data_dict(data[i], problem_types[i], executor, banned_words, uids[i], banned_assertion_keywords)
+            data_dicts.append(data_dict)
+        if problem_type.startswith('gen') and rollout_actor_wg is not None: # get generation rewards
+            PrettyPrinter.section_header("Generating Rewards for Generation Tasks")
+            rewards, valid_programs, invalid_programs = self._get_problem_generator_rewards_and_valid_programs(
+                data_dicts=data_dicts,
+                problem_type=problem_type,
+                n_samples=n_samples,
+                rollout_actor_wg=rollout_actor_wg,
+                executor=executor,
+                input_type_counters=input_type_counters,
+                output_type_counters=output_type_counters,
+                error_type_counters=error_type_counters,
+            )
+            PrettyPrinter.section_header("Combining Rewards for Generation Tasks")
+            for i in range(len(data_dicts)):
+                uid = data_dicts[i]['uid']
+                valid_response_length = data_dicts[i]['valid_response_length']
+                acc_reward = rewards[uid]['accuracy']
+                format_reward = data_dicts[i]['format_score']
+                if format_reward > 0:
+                    if acc_reward > 0:
+                        # Helper function for safe reward combination
+                        def _combine_rewards(acc, intrinsic_components, method):
+                            components = [c for c in intrinsic_components if c is not None]
+                            if method == 'sum':
+                                return acc + sum(components) if components else acc
+                            elif method == 'multiply':
+                                return acc * np.prod([c for c in components]) if components else acc
+                            elif method == 'sum_multiply':
+                                return acc + np.prod([c for c in components]) if components else acc
+                            elif method == 'multiply_sum':
+                                return acc * sum(components) if components else acc
+                            else:
+                                raise ValueError(f"Unknown combination method: {method}")
+                        intrinsic_reward_components = []
+                        if problem_type.endswith('code_f'):
+                            if self.generation_reward_config.f_input_answer_diversity_reward.enabled:
+                                intrinsic_reward_components.append(min(self.generation_reward_config.f_input_answer_diversity_reward.coef * rewards[uid]['input_type_counts'],
+                                    self.generation_reward_config.f_input_answer_diversity_reward.max))
+                            if self.generation_reward_config.f_output_answer_diversity_reward.enabled:
+                                intrinsic_reward_components.append(min(self.generation_reward_config.f_output_answer_diversity_reward.coef * rewards[uid]['output_type_counts'],
+                                    self.generation_reward_config.f_output_answer_diversity_reward.max))
+                        else:
+                            if self.generation_reward_config.complexity_reward.enabled:
+                                intrinsic_reward_components.append(min(self.generation_reward_config.complexity_reward.coef * rewards[uid]['complexity'],
+                                    self.generation_reward_config.complexity_reward.max))
+                            if self.generation_reward_config.mean_edit_distance_reward.enabled:
+                                intrinsic_reward_components.append(min(self.generation_reward_config.mean_edit_distance_reward.coef * rewards[uid]['mean_edit_distance'],
+                                    self.generation_reward_config.mean_edit_distance_reward.max))
+                            if self.generation_reward_config.halstead_reward.enabled:
+                                intrinsic_reward_components.append(min(self.generation_reward_config.halstead_reward.coef * rewards[uid]['halstead'],
+                                    self.generation_reward_config.halstead_reward.max))
+                            if self.generation_reward_config.answer_diversity_reward.enabled:
+                                intrinsic_reward_components.append(min(self.generation_reward_config.answer_diversity_reward.coef * rewards[uid]['type_counts'],
+                                    self.generation_reward_config.answer_diversity_reward.max))
+                        final_reward = _combine_rewards(acc_reward, intrinsic_reward_components, self.generation_reward_config.intrinsic_combine_method)
+                        reward_tensor[i, valid_response_length - 1] = final_reward
+                    else:
+                        reward_tensor[i, valid_response_length - 1] = -0.5
+                else:
+                    reward_tensor[i, valid_response_length - 1] = -1.0
+            all_scores['accuracy'] = [rewards[uid]['accuracy'] for uid in rewards]
+            all_scores['format_score'] = [data_dicts[i]['format_score'] for i in range(len(data))]
+            if 'code_f' not in problem_type:
+                all_scores['answer_diversity'] = [rewards[uid]['type_counts'] for uid in rewards]
+                all_scores['complexity'] = [rewards[uid]['complexity'] for uid in rewards]
+                all_scores['mean_edit_distance'] = [rewards[uid]['mean_edit_distance'] for uid in rewards]
+                all_scores['halstead'] = [rewards[uid]['halstead'] for uid in rewards]
+            else:
+                all_scores['input_answer_diversity'] = [rewards[uid]['input_type_counts'] for uid in rewards]
+                all_scores['output_answer_diversity'] = [rewards[uid]['output_type_counts'] for uid in rewards]
+        elif problem_type.startswith('pred'): # get prediction rewards
+            PrettyPrinter.section_header("Getting Prediction Rewards")
+            all_scores['none_count'] = 0
+            acc_rewards = []
+            for i, data_dict in enumerate(data_dicts):
+                valid_response_length = data_dict['valid_response_length']
+                imports = data_dict['imports']
+                if not problem_type.endswith('code_f'):
+                    answer = data_dict['answer']
+                    gold_input = data_dict['input']
+                    gold_output = data_dict['output']
+                    program = data_dict['program']
+                else:
+                    hidden_inputs = data_dict['hidden_inputs']
+                    hidden_outputs = data_dict['hidden_outputs']
+                if not data_dicts[i]['format_score']: # early stop if the format is not correct
+                    acc_reward = 0.
+                elif problem_types[i].endswith('code_i'):
+                    acc_reward = executor.eval_input_prediction(code=program, gold_output=gold_output, agent_input=answer, imports=list(set(imports)))
+                    # problematic, but we did not encounter too much of this
+                    if acc_reward is None:
+                        all_scores['none_count'] += 1
+                        acc_reward = 0.
+                        print(f"error in pred_code_i, not in [0, 1], acc_reward={acc_reward}\nprogram:\n{program}\n---\nanswer:\n{answer}\n---\nimports:\n{imports}\n---\n")
+                    if acc_reward > 0.0:
+                        correct_predictions.append(data_dict)
+                elif problem_types[i].endswith('code_o'):
+                    acc_reward = executor.eval_output_prediction(code=program, gold_output=gold_output, agent_output=answer, imports=list(set(imports)))
+                    # problematic, but we did not encounter too much of this
+                    if acc_reward is None:
+                        all_scores['none_count'] += 1
+                        acc_reward = 0.
+                        print(f"error in pred_code_o, not in [0, 1], acc_reward={acc_reward}\nprogram:\n{program}\n---\nanswer:\n{answer}\n---\nimports:\n{imports}\n---\n")
+                    if acc_reward > 0.0:
+                        correct_predictions.append(data_dict)
+                elif problem_types[i].endswith('code_e'): # string matching for errors
+                    answer = answer.split(' ')[0].split(':')[0]
+                    if answer.lower() == gold_output.lower():
+                        acc_reward = 1.0
+                        correct_predictions.append(data_dict)
+                    else:
+                        acc_reward = 0.0
+                elif problem_types[i].endswith('code_f'):
+                    input_output_accs = []
+                    program = data_dict['answer']['snippet']
+                    for inpt, outpt in zip(hidden_inputs, hidden_outputs):
+                        input_output_acc = executor.eval_input_prediction(
+                            code=program,
+                            gold_output=outpt,
+                            agent_input=inpt,
+                            imports=list(set(imports)),
+                        )
+                        if input_output_acc is not None:
+                            input_output_accs.append(input_output_acc)
+                    acc_reward = np.mean(input_output_accs) if input_output_accs else 0.0
+                    if self.code_f_reward_type == 'binary':
+                        acc_reward = 1.0 if acc_reward == 1.0 else 0.0
+                    elif self.code_f_reward_type == 'if_one_correct':
+                        acc_reward = 1.0 if acc_reward > 0 else 0.0
+                    # note that if code_f_reward_type==accuracy, it is already handled in the above
+                    if acc_reward > 0:
+                        correct_predictions.append(data_dict)
+                else:
+                    raise ValueError(f"Invalid problem type: {problem_types[i]}")
+                if self.split == 'train':
+                    if data_dicts[i]['format_score'] > 0:
+                        if acc_reward > 0:
+                            reward_tensor[i, valid_response_length - 1] = acc_reward
+                        else:
+                            reward_tensor[i, valid_response_length - 1] = -0.5
+                    else:
+                        reward_tensor[i, valid_response_length - 1] = -1.0
+                elif self.split == 'test': # only acc reward for eval
+                    if acc_reward > 0:
+                        reward_tensor[i, valid_response_length - 1] = 1.0
+                    else:
+                        reward_tensor[i, valid_response_length - 1] = 0.0
+                acc_rewards.append(acc_reward)
+            all_scores['accuracy'] = acc_rewards
+            all_scores['format_score'] = [data_dicts[i]['format_score'] for i in range(len(data))]
+            all_scores['none_ratio'] = all_scores['none_count'] / len(data)
+        return reward_tensor, all_scores, valid_programs, correct_predictions, invalid_programs
+    def _get_problem_generator_rewards_and_valid_programs(
+        self,
+        data_dicts: List[Dict],
+        problem_type: str,
+        n_samples: int,
+        rollout_actor_wg,
+        executor,
+        input_type_counters: Dict[str, Dict[str, int]] = None,
+        output_type_counters: Dict[str, Dict[str, int]] = None,
+        error_type_counters: Dict[str, Dict[str, int]] = None,
+    ) -> Tuple[Dict[str, Dict[str, float]], List[Dict[str, str]]]:
+        """This function uses samples to estimate the accuracy reward for each program, also computes the code complexity and mean edit distance of generated programs.
+            Also returns the valid programs using filters.
+            Args:
+                data_dicts: List[Dict]: A list of data dictionaries.
+                problem_type: str: The type of problem.
+                n_samples: int: The number of samples to use.
+                rollout_actor_wg: RolloutActorWG: The rollout actor.
+                executor: PythonExecutor/CodeBoxExecutor: The executor.
+                type_counters: Dict[str, Dict[str, int]]: The type counters.
+            Returns:
+               rewards: Dict[str, Dict[str, float]]: A dictionary of rewards for each program.
+               valid_programs: List[Dict[str, str]]: A list of valid programs.
+        """
+        if problem_type.endswith('code_i'):
+            type_counters = input_type_counters
+        elif problem_type.endswith('code_o'):
+            type_counters = output_type_counters
+        elif problem_type.endswith('code_e'):
+            type_counters = error_type_counters
+        valid_data_dicts = [data_dict for data_dict in data_dicts if data_dict['code_validity']]
+        uid2valid_dict_idx = {data_dict['uid']: i for i, data_dict in enumerate(valid_data_dicts)}
+        valid_uids = [data_dict['uid'] for data_dict in data_dicts if data_dict['code_validity']]
+        invalid_uids = [data_dict['uid'] for data_dict in data_dicts if not data_dict['code_validity']]
+        assert len(valid_uids) + len(invalid_uids) == len(data_dicts)
+        accuracies = {uid: 1.0 for uid in invalid_uids} # for invalid uids, we give maximum accuracy to the model
+        rewards = defaultdict(dict)
+        valid_programs = []
+        invalid_programs = []
+        if len(valid_uids) > 0:
+            if self.reward_fn_extraction_type.startswith('boxed'):
+                instruction_template = boxed_instruction
+            elif self.reward_fn_extraction_type.startswith('answer'):
+                instruction_template = instruction_following
+            elif self.reward_fn_extraction_type.startswith('none'):
+                instruction_template = '{}'
+            else:
+                raise ValueError(f"Invalid instruction type: {self.reward_fn_extraction_type}")
+            prompts = []
+            if problem_type.endswith('code_i'):
+                pt = 'code_i'
+            elif problem_type.endswith('code_o'):
+                pt = 'code_o'
+            elif problem_type.endswith('code_e'):
+                pt = 'code_e'
+            elif problem_type.endswith('code_f'):
+                pt = 'code_f'
+            else:
+                raise ValueError(f"Invalid problem type: {problem_type}")
+            for data_dict in valid_data_dicts:
+                if pt == 'code_f':
+                    num_given_inputs = len(data_dict['answer']['inputs']) // 2
+                    num_given_outputs = len(data_dict['answer']['outputs']) // 2
+                    data_dict['answer']['given_inputs'] = data_dict['answer']['inputs'][:num_given_inputs]
+                    data_dict['answer']['given_outputs'] = data_dict['answer']['outputs'][:num_given_outputs]
+                    data_dict['answer']['hidden_inputs'] = data_dict['answer']['inputs'][num_given_inputs:]
+                    data_dict['answer']['hidden_outputs'] = data_dict['answer']['outputs'][num_given_outputs:]
+                    io_prompt = instruction_template.format(
+                        get_code_problem_predictor_prompt(
+                            problem_type=problem_type,
+                            snippet=data_dict['answer']['snippet'],
+                            message=data_dict['answer']['message'],
+                            input_output_pairs=zip(data_dict['answer']['given_inputs'], data_dict['answer']['given_outputs']),
+                        )
+                    )
+                else:
+                    io_prompt = instruction_template.format(
+                        get_code_problem_predictor_prompt(
+                            problem_type=pt,
+                            snippet=data_dict['answer']['snippet'],
+                            input_args=data_dict['answer']['input'],
+                            output=data_dict['answer']['output'],
+                        )
+                    )
+                prompts_dict = {
+                    'prompt': [{'role': 'user', 'content': io_prompt}],
+                    'uid': data_dict['uid'],
+                    'problem': data_dict['answer'],
+                    'data_source': data_dict['data_source'],
+                    'ground_truth': data_dict['answer']['output'] if pt != 'code_f' else data_dict['answer']['snippet'],
+                    'extra_info': data_dict['extra_info'],
+                    'program': data_dict['answer']['snippet'],
+                    'imports': data_dict['answer']['imports'],
+                    'references': data_dict['references'],
+                }
+                if pt == 'code_f':
+                    prompts_dict.update({
+                        'given_inputs': data_dict['answer']['given_inputs'],
+                        'given_outputs': data_dict['answer']['given_outputs'],
+                        'hidden_inputs': data_dict['answer']['hidden_inputs'],
+                        'hidden_outputs': data_dict['answer']['hidden_outputs'],
+                        'message': data_dict['answer']['message'],
+                    })
+                else:
+                    prompts_dict.update({
+                        'input': data_dict['answer']['input'],
+                        'output': data_dict['answer']['output'],
+                        'original_program': data_dict['answer']['original_snippet'],
+                        'composite_functions': data_dict['answer']['composite_functions'],
+                    })
+                prompts.append(prompts_dict)
+            # sampling to estimate the accuracy
+            PrettyPrinter.section_header("Sampling to Estimate Accuracy")
+            prompts = prompts * n_samples # repeat the prompts n_samples times
+            pd.DataFrame(prompts).to_parquet(f'{self.output_path}/temp.parquet') # RLHFDataset expects parquet
+            temp_data = RLHFDataset(
+                parquet_files=f'{self.output_path}/temp.parquet',
+                tokenizer=self.tokenizer,
+                prompt_key='prompt',
+                max_prompt_length=self.max_prompt_length,
+                filter_prompts=True,
+                return_raw_chat=False,
+                truncation='error'
+            )
+            os.remove(f'{self.output_path}/temp.parquet') # we do not need this file after we load in the dataset
+            sampler = torch.utils.data.SequentialSampler(data_source=temp_data)
+            dataloader = torch.utils.data.DataLoader(
+                dataset=temp_data,
+                batch_size=len(temp_data),
+                drop_last=False,
+                shuffle=False,
+                collate_fn=collate_fn,
+                sampler=sampler,
+            )
+            assert len(dataloader) == 1
+            data = next(iter(dataloader))
+            batch = DataProto.from_single_dict(data)
+            gen_batch = batch.pop(['input_ids', 'attention_mask', 'position_ids'])
+            gen_batch.meta_info = {
+                'eos_token_id': self.tokenizer.eos_token_id,
+                'pad_token_id': self.tokenizer.pad_token_id,
+                'recompute_log_prob': False,
+                'do_sample': True,
+                'validate': True,
+            }
+            # pad to be divisible by dp_size
+            gen_batch_padded, pad_size = pad_dataproto_to_divisor(gen_batch, rollout_actor_wg.world_size)
+            output_gen_batch_padded = rollout_actor_wg.generate_sequences(gen_batch_padded)
+            # unpad
+            output_gen_batch = unpad_dataproto(output_gen_batch_padded, pad_size=pad_size)
+            print('validation generation end')
+            # Store generated outputs
+            batch = batch.union(output_gen_batch)
+            batched_responses = []
+            for b in batch:
+                batch_dict = {
+                        'extracted_answers': extract_answer(
+                            self.tokenizer.decode(b.batch['responses'], skip_special_tokens=True),
+                            self.reward_fn_extraction_type,
+                            boxed_retry=self.boxed_retry,
+                        ),
+                        'uid': b.non_tensor_batch['uid'],
+                        'problem': b.non_tensor_batch['problem'],
+                        'data_source': b.non_tensor_batch['data_source'],
+                        'extra_info': b.non_tensor_batch['extra_info'],
+                        'program': b.non_tensor_batch['program'],
+                        'references': b.non_tensor_batch['references'],
+                        'imports': b.non_tensor_batch['imports'],
+                    }
+                if pt == 'code_f':
+                    batch_dict.update({
+                        'given_inputs': b.non_tensor_batch['given_inputs'],
+                        'given_outputs': b.non_tensor_batch['given_outputs'],
+                        'hidden_inputs': b.non_tensor_batch['hidden_inputs'],
+                        'hidden_outputs': b.non_tensor_batch['hidden_outputs'],
+                        'message': b.non_tensor_batch['message'],
+                    })
+                else:
+                    batch_dict.update({
+                        'input': b.non_tensor_batch['input'],
+                        'output': b.non_tensor_batch['output'],
+                        'original_program': b.non_tensor_batch['original_program'],
+                        'composite_functions': b.non_tensor_batch['composite_functions'].tolist(),
+                    })
+                batched_responses.append(batch_dict)
+            df = pd.DataFrame(batched_responses)
+            # estimating accuracy using python executor
+            PrettyPrinter.section_header("Estimating Accuracy Using Python Executor")
+            for valid_uid in valid_uids:
+                df_valid = df[df['uid'] == valid_uid]
+                if df_valid.empty: # the prompt got filtered out TODO: check
+                    accuracies[valid_uid] = 0.0
+                    continue
+                if pt != 'code_f':
+                    answers = [self.extract_input_output(
+                        answer,
+                        return_input=problem_type.endswith('code_i'),
+                        return_output=(problem_type.endswith('code_o') or problem_type.endswith('code_e')) # code_e output format is same as code_o
+                    ) for answer in df_valid['extracted_answers'].tolist()]
+                else:
+                    answers = [parse_code_function(answer) for answer in df_valid['extracted_answers'].tolist()]
+                answer_cache = {} # for the same uid, the answer is the same and the program is assumed to be deterministic, therefore we cache the answer -> accuracy mapping
+                if pt == 'code_f':
+                    hidden_outputs = df_valid['hidden_outputs'].tolist()[0].tolist()
+                    hidden_inputs = df_valid['hidden_inputs'].tolist()[0].tolist()
+                else:
+                    gold_output = df_valid['output'].tolist()[0]
+                    program = df_valid['program'].tolist()[0]
+                    # gold_input = df_valid['input'].tolist()[0]
+                imports = df_valid['imports'].tolist()[0]
+                problem_accuracies = []
+                if problem_type.endswith('code_i'):
+                    if self.batched_estimate:
+                        problem_accuracies = executor.eval_k_input_prediction(code=program, gold_output=gold_output, k_agent_inputs=answers, imports=list(set(imports)))
+                    else:
+                        for answer in answers:
+                            if answer in answer_cache:
+                                problem_accuracies.append(answer_cache[answer])
+                                continue
+                            acc_reward = executor.eval_input_prediction(code=program, gold_output=gold_output, agent_input=answer, imports=list(set(imports)))
+                            if acc_reward is not None:
+                                problem_accuracies.append(acc_reward)
+                            answer_cache[answer] = acc_reward
+                        # if self.debug:
+                        #     batched_problem_accuracies = executor.eval_k_input_prediction(code=program, gold_output=gold_output, k_agent_inputs=answers, imports=list(set(imports)))
+                        #     assert np.mean(batched_problem_accuracies) == np.mean(problem_accuracies), f"Gen I batch accuracy: {np.mean(batched_problem_accuracies)}, Single accuracy: {np.mean(problem_accuracies)}"
+                elif problem_type.endswith('code_o'):
+                    if self.batched_estimate:
+                        problem_accuracies = executor.eval_k_output_prediction(code=program, gold_output=gold_output, k_agent_outputs=answers, imports=list(set(imports)))
+                    else:
+                        for answer in answers:
+                            if answer in answer_cache:
+                                problem_accuracies.append(answer_cache[answer])
+                                continue
+                            acc_reward = executor.eval_output_prediction(code=program, gold_output=gold_output, agent_output=answer, imports=list(set(imports)))
+                            if acc_reward is not None:
+                                problem_accuracies.append(acc_reward)
+                            answer_cache[answer] = acc_reward
+                        # if self.debug:
+                        #     batched_problem_accuracies = executor.eval_k_output_prediction(code=program, gold_output=gold_output, k_agent_outputs=answers, imports=list(set(imports)))
+                        #     assert np.mean(batched_problem_accuracies) == np.mean(problem_accuracies), f"Gen O batch accuracy: {np.mean(batched_problem_accuracies)}, Single accuracy: {np.mean(problem_accuracies)}"
+                elif problem_type.endswith('code_e'): # string matching for errors
+                    for answer in answers:
+                        answer = answer.split(' ')[0].split(':')[0]
+                        if answer.lower() == gold_output.lower():
+                            problem_accuracies.append(1.0)
+                        else:
+                            problem_accuracies.append(0.0)
+                elif problem_type.endswith('code_f'):
+                    for parsed, answer in answers: # for each input/output set, we sampled n codes to estimate the accuracy
+                        if not parsed: # the code answer is not parsed, we assume the code is not valid
+                            problem_accuracies.append(0.0)
+                            continue
+                        code_accuracies = []
+                        for inpt, outpt in zip(hidden_inputs, hidden_outputs):
+                            code_accuracies.append(executor.eval_input_prediction(code=answer, gold_output=outpt, agent_input=inpt, imports=list(set(imports))))
+                        answer_acc = np.mean([a for a in code_accuracies if a is not None]) if code_accuracies else 0.0
+                        if self.code_f_reward_type == 'binary':
+                            problem_accuracies.append(1.0 if answer_acc == 1.0 else 0.0)
+                        elif self.code_f_reward_type == 'if_one_correct':
+                            problem_accuracies.append(1.0 if answer_acc > 0 else 0.0)
+                        elif self.code_f_reward_type == 'accuracy':
+                            problem_accuracies.append(answer_acc)
+                        else:
+                            raise ValueError(f"Invalid code_f_reward_type: {self.code_f_reward_type}")
+                accuracies[valid_uid] = sum(problem_accuracies) / len(problem_accuracies) if problem_accuracies else 0.0
+                # filtering valid programs
+                if self.valid_program_filter == 'all':
+                    valid_programs.append(valid_data_dicts[uid2valid_dict_idx[valid_uid]]['answer'])
+                elif self.valid_program_filter == 'non_one':
+                    if accuracies[valid_uid] < 1.0:
+                        valid_programs.append(valid_data_dicts[uid2valid_dict_idx[valid_uid]]['answer'])
+                elif self.valid_program_filter == 'non_extremes':
+                    if accuracies[valid_uid] > 0.0 and accuracies[valid_uid] < 1.0:
+                        valid_programs.append(valid_data_dicts[uid2valid_dict_idx[valid_uid]]['answer'])
+                else:
+                    raise ValueError(f"Invalid valid program filter: {self.valid_program_filter}")
+        # collecting invalid programs for analysis
+        invalid_data_dicts = [data_dict for data_dict in data_dicts if not data_dict['code_validity']]
+        for i, invalid_data_dict in enumerate(invalid_data_dicts):
+            # Create a unique label for each invalid problem
+            problem_label = f"{problem_type}_invalid_{invalid_data_dict.get('uid', i)}"
+            # Store the full LLM prompt and response for analysis
+            invalid_program = {
+                'problem_id': problem_label,
+                'llm_prompt': invalid_data_dict.get('prompt', ''),
+                'llm_response': invalid_data_dict.get('response', ''),
+                'invalid_reason': invalid_data_dict.get('error_message', 'Parsing or validation failed'),
+                'format_score': invalid_data_dict.get('format_score', 0.0),
+                'problem_type': problem_type,
+                'timestamp': invalid_data_dict.get('timestamp', ''),
+                'raw_data': invalid_data_dict  # Keep original data for debugging
+            }
+            invalid_programs.append(invalid_program)
+        # getting other rewards
+        PrettyPrinter.section_header("Getting Other Rewards")
+        # outputting rewards
+        for d in data_dicts:
+            uid = d['uid']
+            if self.generation_reward_config.generation_accuracy_convertion == 'one_minus':
+                rewards[uid]['accuracy'] = (1 - accuracies[uid]) if accuracies[uid] > 0 else 0.0
+            elif self.generation_reward_config.generation_accuracy_convertion == 'inverse':
+                rewards[uid]['accuracy'] = 1 - accuracies[uid]
+            else:
+                raise ValueError(f"Invalid generation accuracy convertion: {self.generation_reward_config.generation_accuracy_convertion}")
+        if not problem_type.endswith('code_f'):
+            code_key = 'original_snippet' if self.use_original_code_as_ref else 'snippet'
+            reference_key = 'original_references' if self.use_original_code_as_ref else 'references'
+            if problem_type.endswith('code_i'):
+                type_counter_key = 'input'
+            elif problem_type.endswith('code_o'):
+                type_counter_key = 'output'
+            elif problem_type.endswith('code_e'):
+                type_counter_key = 'error'
+            else:
+                raise ValueError(f"Invalid problem type: {problem_type}")
+            for data_dict in data_dicts:
+                rewards[data_dict['uid']]['complexity'] = get_code_complexity_reward(data_dict['answer'][code_key]) if 'answer' in data_dict else 0.0
+            for data_dict in data_dicts:
+                rewards[data_dict['uid']]['mean_edit_distance'] = np.mean([ast_edit_distance(data_dict['answer'][code_key], ref) for ref in data_dict[reference_key]]) if 'answer' in data_dict else 0.0
+            for data_dict in data_dicts:
+                rewards[data_dict['uid']]['halstead'] = get_halstead_reward(data_dict['answer'][code_key]) if 'answer' in data_dict else 0.0
+            for data_dict in data_dicts:
+                rewards[data_dict['uid']]['type_counts'] = get_type_counts_reward(
+                    data_dict['answer'][type_counter_key],
+                    type_counters,
+                    hierarchical=self.generation_reward_config.answer_diversity_reward.hierarchical
+                ) if 'answer' in data_dict else 0.0
+            if self.debug:
+                for data_dict in data_dicts:
+                    if 'answer' in data_dict:
+                        continue
+        else:
+            for data_dict in data_dicts:
+                rewards[data_dict['uid']]['input_type_counts'] = []
+                rewards[data_dict['uid']]['output_type_counts'] = []
+                if 'answer' in data_dict:
+                    for inpt, outpt in zip(data_dict['answer']['inputs'], data_dict['answer']['outputs']):
+                        rewards[data_dict['uid']]['input_type_counts'].append(get_type_counts_reward(
+                            inpt,
+                            input_type_counters,
+                            hierarchical=self.generation_reward_config.answer_diversity_reward.hierarchical
+                        ))
+                        rewards[data_dict['uid']]['output_type_counts'].append(get_type_counts_reward(
+                            outpt,
+                            output_type_counters,
+                            hierarchical=self.generation_reward_config.answer_diversity_reward.hierarchical
+                        ))
+                    rewards[data_dict['uid']]['input_type_counts'] = np.mean(rewards[data_dict['uid']]['input_type_counts'])
+                    rewards[data_dict['uid']]['output_type_counts'] = np.mean(rewards[data_dict['uid']]['output_type_counts'])
+                else:
+                    rewards[data_dict['uid']]['input_type_counts'] = 0.0
+                    rewards[data_dict['uid']]['output_type_counts'] = 0.0
+        # turn into normal dict
+        rewards = dict(rewards)
+        return rewards, valid_programs, invalid_programs

absolute_zero_reasoner/rewards/ttrlvr_reward_manager.py ADDED Viewed

	@@ -0,0 +1,244 @@

+"""
+TTRLVR Reward Manager for AZR Integration
+TTRLVR의 complete_pipeline.py에 있는 _compute_rewards_with_azr 로직을
+완전히 동일하게 AZR에서 사용할 수 있도록 통합
+"""
+import re
+from typing import Dict, Any, List, Optional, Tuple
+import torch
+from transformers import AutoTokenizer
+from .reward_managers import CodeIORewardManager
+from ..utils.code_utils.python_executor import PythonExecutor
+from ..utils.code_utils.templates import EVAL_INPUT_PREDICTION_TEMPLATE
+class TTRLVRRewardManager(CodeIORewardManager):
+    """TTRLVR 전용 Reward Manager - complete_pipeline.py의 로직 그대로 사용"""
+    def __init__(self, tokenizer: AutoTokenizer, **kwargs):
+        """Initialise the parent CodeIORewardManager and create a PythonExecutor.
+        Args:
+            tokenizer: Forwarded unchanged to the parent constructor.
+            **kwargs: Forwarded unchanged to the parent constructor.
+        """
+        super().__init__(tokenizer=tokenizer, **kwargs)
+        # Executor used by compute_rewards to run code for reward evaluation.
+        self.executor = PythonExecutor()
+    def compute_rewards(self,
+                       prompts: List[str],
+                       responses: List[str],
+                       metadata: List[Dict[str, Any]]) -> List[float]:
+        """
+        Score each response; same logic as _compute_rewards_with_azr in TTRLVR's complete_pipeline.py.
+        The three lists are walked in lockstep with zip, so iteration stops at the
+        shortest one. Any exception raised while scoring an item yields 0.0 for it.
+        Scoring per ``task_type`` (read from each metadata dict):
+            - 'abduction': the extracted answer is used as the input of
+              ``evaluation_data['function_code']`` (renamed to ``f``) via
+              EVAL_INPUT_PREDICTION_TEMPLATE; a truthy execution result scores 1.0.
+            - 'deduction': ``eval`` of ``expected_solution`` is compared with
+              ``eval`` of the extracted answer.
+            - 'induction': the extracted code is run against every pair in
+              ``evaluation_data['input_output_pairs']`` and the results averaged.
+            - anything else: stripped string equality with ``expected_solution``.
+        Args:
+            prompts: List of prompts (not used for scoring).
+            responses: List of responses generated by the model.
+            metadata: Per-task metadata (task_type, evaluation_data, expected_solution).
+        Returns:
+            rewards: One reward per scored response.
+        """
+        rewards = []
+        # Print brief debugging information
+        if metadata and len(metadata) > 0:
+            # Check the task-type distribution
+            task_types = [m.get('task_type', 'unknown') for m in metadata]
+            from collections import Counter
+            task_count = Counter(task_types)
+            print(f"\n[TTRLVR] Processing batch - Task distribution: {dict(task_count)}")
+        for prompt, response, meta in zip(prompts, responses, metadata):
+            task_type = meta.get('task_type', 'unknown')
+            evaluation_data = meta.get('evaluation_data', {})
+            expected = meta.get('expected_solution', '')
+            # Same as complete_pipeline.py:458
+            extracted_answer = self._extract_answer_by_task_type(response, task_type)
+            # Evaluation based on actual code execution (same as complete_pipeline.py:461-584)
+            try:
+                if task_type == 'abduction':
+                    # Taken as-is from complete_pipeline.py:462-548
+                    code = evaluation_data['function_code']
+                    expected_output = evaluation_data['expected_output']
+                    agent_input = extracted_answer
+                    # Extract only the function definition
+                    code = self._extract_function_definition(code)
+                    # Extract the function name and rename it to f
+                    func_name_match = re.search(r'def\s+(\w+)\s*\(', code)
+                    if func_name_match:
+                        original_func_name = func_name_match.group(1)
+                        code = re.sub(r'def\s+' + re.escape(original_func_name) + r'\s*\(', 'def f(', code)
+                    # Convert expected_output into an actual value; fall back to the raw value on failure
+                    try:
+                        expected_output_value = eval(expected_output)
+                    except:
+                        expected_output_value = expected_output
+                    # Use EVAL_INPUT_PREDICTION_TEMPLATE
+                    try:
+                        code_snippet = EVAL_INPUT_PREDICTION_TEMPLATE.format(
+                            code=code,
+                            gold_output=expected_output_value,
+                            agent_input=agent_input
+                        )
+                        result, status = self.executor.apply(code_snippet)
+                        if 'error' in status.lower():
+                            accuracy = 0.0
+                        else:
+                            # Interpret the execution result: a truthy value means the prediction matched
+                            try:
+                                if isinstance(result, bool):
+                                    agent_output = result
+                                else:
+                                    agent_output = eval(result)
+                                accuracy = 1.0 if agent_output else 0.0
+                            except:
+                                accuracy = 0.0
+                    except:
+                        accuracy = 0.0
+                elif task_type == 'deduction':
+                    # Taken as-is from complete_pipeline.py:549-558
+                    expected_output = expected
+                    agent_output = extracted_answer
+                    # Simple eval-based comparison
+                    # NOTE(review): agent_output is text extracted from the model response and is
+                    # passed to eval() in this process. The bare except below also catches
+                    # SystemExit/KeyboardInterrupt raised by the evaluated text, so narrowing it
+                    # to `except Exception` would let such an answer terminate the caller.
+                    try:
+                        accuracy = 1.0 if eval(expected_output) == eval(agent_output) else 0.0
+                    except:
+                        accuracy = 0.0
+                elif task_type == 'induction':
+                    # Taken as-is from complete_pipeline.py:560-575
+                    input_output_pairs = evaluation_data.get('input_output_pairs', [])
+                    agent_code = extracted_answer
+                    # Handle numpy arrays: undo data-format changes caused by JSON/Parquet storage
+                    import numpy as np
+                    if isinstance(input_output_pairs, np.ndarray):
+                        # Unwrap doubly nested numpy arrays (produced when saving to parquet)
+                        # array([array(['input', 'output'])]) -> array(['input', 'output'])
+                        if len(input_output_pairs) == 1 and isinstance(input_output_pairs[0], np.ndarray):
+                            # Single test case: convert the array to a list, then wrap it in a list again
+                            inner_array = input_output_pairs[0]
+                            input_output_pairs = [inner_array.tolist()]  # [['input', 'output']]
+                        else:
+                            # Multiple test cases: convert each one to a list
+                            input_output_pairs = [item.tolist() if isinstance(item, np.ndarray) else item
+                                                 for item in input_output_pairs]
+                    # Handle a list that is itself a bare ['input', 'output'] pair
+                    if isinstance(input_output_pairs, list) and len(input_output_pairs) == 2:
+                        if isinstance(input_output_pairs[0], str) and isinstance(input_output_pairs[1], str):
+                            # Wrap it as a single test case
+                            input_output_pairs = [input_output_pairs]
+                    # Test against every input-output pair
+                    accuracies = []
+                    for i, pair in enumerate(input_output_pairs):
+                        try:
+                            # Convert numpy arrays to lists
+                            if isinstance(pair, np.ndarray):
+                                pair = pair.tolist()
+                            # Extract the input and output from a list or tuple
+                            if isinstance(pair, (list, tuple)) and len(pair) >= 2:
+                                test_input, expected_output = pair[0], pair[1]
+                            else:
+                                # Malformed pair: score 0
+                                accuracies.append(0.0)
+                                continue
+                            # Run the code and evaluate it; a None result is counted as 0
+                            accuracy = self.executor.eval_input_prediction(agent_code, expected_output, test_input)
+                            accuracies.append(accuracy if accuracy is not None else 0.0)
+                        except Exception as e:
+                            # Score 0 when an exception occurs
+                            accuracies.append(0.0)
+                    # Compute the mean accuracy
+                    accuracy = sum(accuracies) / len(accuracies) if accuracies else 0.0
+                else:
+                    # Taken as-is from complete_pipeline.py:578-579
+                    accuracy = 1.0 if expected.strip() == extracted_answer.strip() else 0.0
+            except Exception as e:
+                accuracy = 0.0
+            rewards.append(accuracy)
+        # Brief summary of the computed rewards
+        if rewards:
+            import numpy as np
+            mean_reward = np.mean(rewards)
+            print(f"[TTRLVR] Batch rewards - Mean: {mean_reward:.4f}, Min: {min(rewards):.4f}, Max: {max(rewards):.4f}")
+        return rewards
+    def _extract_answer_by_task_type(self, llm_response: str, task_type: str) -> str:
+        """complete_pipeline.py의 _extract_answer_by_task_type와 동일"""
+        # <answer>...</answer> 태그 추출
+        match = re.search(r'<answer>(.*?)</answer>', llm_response, re.DOTALL)
+        if match:
+            answer_content = match.group(1).strip()
+            # Task 타입별 후처리
+            if task_type == 'induction':
+                # 코드 블록에서 def f(...) 추출
+                if 'def f(' in answer_content:
+                    return answer_content
+                # 코드 블록 마커 제거
+                answer_content = answer_content.replace('```python', '').replace('```', '').strip()
+                return answer_content
+            elif task_type == 'deduction':
+                # 출력값 정리
+                return answer_content.strip()
+            elif task_type == 'abduction':
+                # 입��값 정리
+                return answer_content.strip()
+        # 태그가 없으면 전체 응답 반환
+        return llm_response.strip()
+    def _extract_function_definition(self, code: str) -> str:
+        """complete_pipeline.py:470-502와 동일한 함수"""
+        lines = code.split('\n')
+        import_lines = []
+        func_lines = []
+        in_function = False
+        base_indent = None
+        for line in lines:
+            # import 문 수집
+            if line.strip().startswith('from ') or line.strip().startswith('import '):
+                import_lines.append(line)
+            # 함수 정의 시작
+            elif line.strip().startswith('def '):
+                in_function = True
+                base_indent = len(line) - len(line.lstrip())
+                func_lines.append(line)
+            elif in_function:
+                # 빈 줄이거나 함수 내부인 경우
+                if line.strip() == '':
+                    func_lines.append(line)
+                elif line.startswith(' ' * (base_indent + 1)) or line.startswith('\t'):
+                    # 함수 내부 (들여쓰기가 더 깊음)
+                    func_lines.append(line)
+                else:
+                    # 함수 외부 코드 (assert 문 등) - 중단
+                    break
+        # import문과 함수를 합쳐서 반환
+        if import_lines:
+            return '\n'.join(import_lines) + '\n\n' + '\n'.join(func_lines)
+        else:
+            return '\n'.join(func_lines)

absolute_zero_reasoner/testtime/__init__.py ADDED Viewed

	@@ -0,0 +1,34 @@

+"""
+TestTime RLVR Components
+This module contains all TestTime-specific components adapted from AZR:
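+- BenchmarkProblemLoader: benchmark problem loading
+- IPOTripleExtractor: (Input, Program, Output) triple extraction
+- TestTimeTaskGenerator: Induction/Deduction/Abduction task generation
+- TestTimeRLVRTrainer: TestTime-specific RLVR training (planned; not yet implemented or exported)
+- TestTimeRewardManager: TestTime reward computation (planned; not yet implemented or exported)
+- TestTimeLogger: comprehensive logging system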
+"""
+from .benchmark_loader import BenchmarkProblemLoader
+from .solution_generator import InitialSolutionGenerator
+from .ipo_extractor import IPOTripleExtractor
+from .task_generator import TestTimeTaskGenerator
+from .logger import TestTimeLogger
+from .config import TestTimeConfig, BenchmarkConfig
+# 향후 구현 예정
+# from .testtime_trainer import TestTimeRLVRTrainer
+# from .reward_manager import TestTimeRewardManager
+__all__ = [
+    'BenchmarkProblemLoader',
+    'InitialSolutionGenerator',
+    'IPOTripleExtractor',
+    'TestTimeTaskGenerator',
+    'TestTimeLogger',
+    'TestTimeConfig',
+    'BenchmarkConfig'
+    # 'TestTimeRLVRTrainer',
+    # 'TestTimeRewardManager',
+]

absolute_zero_reasoner/testtime/benchmark_loader.py ADDED Viewed

	@@ -0,0 +1,223 @@

+"""
+Benchmark Problem Loader
+AZR 기반 TestTime RLVR을 위한 벤치마크 문제 로딩 시스템
+기존 Test-Time-RLVR의 load_humaneval_problem 함수를 확장
+"""
+import json
+import os
+from typing import Dict, List, Any, Tuple, Optional
+from pathlib import Path
+from .config import BenchmarkConfig, TestTimeConfig
+from .logger import TestTimeLogger
+class BenchmarkProblemLoader:
+    """벤치마크 문제 로딩 및 관리 (EvalPlus 표준 방식 사용)"""
+    def __init__(self, config: TestTimeConfig, logger: Optional[TestTimeLogger] = None):
+        """Store the configuration and logger and create empty caches.
+        Args:
+            config: TestTime configuration, stored as ``self.config``.
+            logger: Logger to use; a new TestTimeLogger is created when falsy.
+        """
+        self.config = config
+        self.logger = logger or TestTimeLogger()
+        self.loaded_problems = {}  # cache of loaded problems, keyed by "<benchmark name>_<problem id>"
+        self.evalplus_cache = {}  # cache of EvalPlus data, keyed by benchmark name
+    def _load_evalplus_data(self, benchmark_name: str) -> Dict[str, Dict[str, Any]]:
+        """Load a benchmark's EvalPlus data and cache it.
+        Args:
+            benchmark_name: 'mbpp' or 'humaneval'.
+        Returns:
+            The problems returned by EvalPlus, or an empty dict when loading
+            fails for any reason (including an unsupported benchmark name).
+            Failures are logged and not cached, so the next call retries.
+        """
+        if benchmark_name in self.evalplus_cache:
+            return self.evalplus_cache[benchmark_name]
+        try:
+            # evalplus is imported lazily so an import failure is handled below
+            if benchmark_name == 'mbpp':
+                from evalplus.data.mbpp import get_mbpp_plus
+                problems = get_mbpp_plus()  # mbpp_deserialize_inputs is applied automatically
+                self.logger.log_info(f"✅ MBPP+ EvalPlus 데이터 로드 성공: {len(problems)}개 문제")
+            elif benchmark_name == 'humaneval':
+                from evalplus.data.humaneval import get_human_eval_plus
+                problems = get_human_eval_plus()  # standard EvalPlus approach
+                self.logger.log_info(f"✅ HumanEval+ EvalPlus 데이터 로드 성공: {len(problems)}개 문제")
+            else:
+                raise ValueError(f"Unsupported benchmark for EvalPlus: {benchmark_name}")
+            self.evalplus_cache[benchmark_name] = problems
+            return problems
+        except Exception as e:
+            # The ValueError raised above is also caught here and turned into {}
+            self.logger.log_error(f"❌ {benchmark_name.upper()}+ EvalPlus 로딩 실패: {e}")
+            return {}
+    def load_problem(self, benchmark_config: BenchmarkConfig, problem_id: str) -> Dict[str, Any]:
+        """특정 벤치마크 문제 로드 (EvalPlus 표준 방식 우선 사용)"""
+        cache_key = f"{benchmark_config.name}_{problem_id}"
+        if cache_key in self.loaded_problems:
+            return self.loaded_problems[cache_key]
+        # EvalPlus 방식 시도
+        if benchmark_config.name in ['mbpp', 'humaneval']:
+            evalplus_problems = self._load_evalplus_data(benchmark_config.name)
+            if problem_id in evalplus_problems:
+                problem = evalplus_problems[problem_id].copy()
+                # 추가 메타데이터 설정
+                problem['benchmark_name'] = benchmark_config.name
+                problem['benchmark_config'] = benchmark_config
+                # 캐시에 저장
+                self.loaded_problems[cache_key] = problem
+                self.logger.log_info(f"✅ Problem loaded: {problem_id} from {benchmark_config.name} (EvalPlus)")
+                return problem
+        # Fallback: 기존 방식
+        self.logger.log_info(f"⚠️  {problem_id} EvalPlus 로딩 실패, 기존 방식 사용")
+        problem_file = benchmark_config.data_path
+        # 파일 존재 확인
+        if not os.path.exists(problem_file):
+            raise FileNotFoundError(f"Benchmark file not found: {problem_file}")
+        # JSONL 파일 로드 (기존 방식과 동일)
+        with open(problem_file, 'r', encoding='utf-8') as f:
+            problems = [json.loads(line) for line in f]
+        # 문제 ID로 검색
+        for problem in problems:
+            if problem['task_id'] == problem_id:
+                # 추가 메타데이터 설정
+                problem['benchmark_name'] = benchmark_config.name
+                problem['benchmark_config'] = benchmark_config
+                # 캐시에 저장
+                self.loaded_problems[cache_key] = problem
+                self.logger.log_info(f"✅ Problem loaded: {problem_id} from {benchmark_config.name} (Original)")
+                return problem
+        raise ValueError(f"Problem {problem_id} not found in {problem_file}")
+    def load_problem_batch(self, benchmark_config: BenchmarkConfig,
+                          problem_ids: List[str]) -> List[Dict[str, Any]]:
+        """여러 문제 배치 로딩"""
+        problems = []
+        for problem_id in problem_ids:
+            try:
+                problem = self.load_problem(benchmark_config, problem_id)
+                problems.append(problem)
+            except Exception as e:
+                self.logger.log_error(f"Failed to load {problem_id}: {e}")
+        return problems
+    def get_test_cases(self, problem: Dict[str, Any]) -> List[Tuple[str, str]]:
+        """문제에서 테스트 케이스 추출"""
+        test_cases = []
+        # 기본 테스트 케이스 (test 필드)
+        if 'test' in problem:
+            test_code = problem['test']
+            # assert 문에서 입력-출력 쌍 추출
+            test_cases.extend(self._parse_assert_statements(test_code))
+        # Plus 테스트 케이스 (plus_input, plus_output)
+        if 'plus_input' in problem and 'plus_output' in problem:
+            plus_inputs = problem['plus_input']
+            plus_outputs = problem['plus_output']
+            if isinstance(plus_inputs, str):
+                plus_inputs = json.loads(plus_inputs)
+            if isinstance(plus_outputs, str):
+                plus_outputs = json.loads(plus_outputs)
+            for inp, out in zip(plus_inputs, plus_outputs):
+                test_cases.append((str(inp), str(out)))
+        return test_cases
+    def _parse_assert_statements(self, test_code: str) -> List[Tuple[str, str]]:
+        """assert 문에서 입력-출력 쌍 추출"""
+        import re
+        test_cases = []
+        lines = test_code.strip().split('\n')
+        for line in lines:
+            line = line.strip()
+            if line.startswith('assert '):
+                # assert function(args) == expected 형태 파싱
+                match = re.match(r'assert\s+(\w+)\(([^)]*)\)\s*==\s*(.+)', line)
+                if match:
+                    func_name, args, expected = match.groups()
+                    test_cases.append((args.strip(), expected.strip()))
+        return test_cases
+    def validate_solution(self, problem: Dict[str, Any], solution: str) -> Dict[str, Any]:
+        """솔루션 검증 (AZR Python Executor 사용 예정)"""
+        validation_result = {
+            'problem_id': problem['task_id'],
+            'solution': solution,
+            'syntax_valid': False,
+            'test_results': [],
+            'overall_success': False,
+            'error_message': None
+        }
+        try:
+            # 1. 구문 검증
+            compile(solution, '<string>', 'exec')
+            validation_result['syntax_valid'] = True
+            # 2. 테스트 케이스 실행 (향후 AZR Python Executor 연동)
+            test_cases = self.get_test_cases(problem)
+            validation_result['test_results'] = [
+                {'input': inp, 'expected': out, 'passed': False}
+                for inp, out in test_cases
+            ]
+            # 임시: 구문만 통과하면 성공으로 처리
+            validation_result['overall_success'] = True
+        except SyntaxError as e:
+            validation_result['error_message'] = f"Syntax Error: {e}"
+        except Exception as e:
+            validation_result['error_message'] = f"Validation Error: {e}"
+        return validation_result
+    def get_sequential_problems(self, benchmark_config: BenchmarkConfig,
+                              num_problems: int) -> List[Dict[str, Any]]:
+        """순차적으로 N개 문제 로드"""
+        problems = []
+        for i in range(num_problems):
+            problem_index = benchmark_config.start_index + i
+            problem_id = f"{benchmark_config.problem_prefix}/{problem_index}"
+            try:
+                problem = self.load_problem(benchmark_config, problem_id)
+                problems.append(problem)
+            except Exception as e:
+                self.logger.log_error(f"Failed to load {problem_id}: {e}")
+                continue
+        return problems
+    def get_problem_statistics(self, benchmark_config: BenchmarkConfig) -> Dict[str, Any]:
+        """벤치마크 통계 정보"""
+        problem_file = benchmark_config.data_path
+        if not os.path.exists(problem_file):
+            return {"error": f"File not found: {problem_file}"}
+        with open(problem_file, 'r', encoding='utf-8') as f:
+            problems = [json.loads(line) for line in f]
+        stats = {
+            'total_problems': len(problems),
+            'benchmark_name': benchmark_config.name,
+            'data_file': problem_file,
+            'sample_problem_ids': [p['task_id'] for p in problems[:5]]
+        }
+        return stats

absolute_zero_reasoner/testtime/complete_pipeline.py ADDED Viewed

The diff for this file is too large to render. See raw diff

absolute_zero_reasoner/testtime/config.py ADDED Viewed

	@@ -0,0 +1,162 @@

+"""
+TestTime RLVR Configuration
+AZR 기반 TestTime RLVR을 위한 설정 클래스
+"""
+from dataclasses import dataclass
+from typing import Optional, List, Dict, Any
+import torch
+@dataclass
+class TestTimeConfig:
+    """TestTime RLVR 전용 설정"""
+    # ============================================================================
+    # 기본 모델 설정 (AZR 기반)
+    # ============================================================================
+    model_name: str = "Qwen/Qwen2.5-7B"
+    device: str = "auto"
+    torch_dtype: torch.dtype = torch.bfloat16
+    use_flash_attention: bool = True
+    enable_gradient_checkpointing: bool = True
+    # ============================================================================
+    # TestTime 학습 설정
+    # ============================================================================
+    max_adaptation_steps: int = 10  # AZR 대비 짧은 적응 학습
+    adaptation_batch_size: int = 1  # 소규모 배치
+    gradient_accumulation_steps: int = 4
+    learning_rate: float = 1e-6  # AZR과 동일
+    # ============================================================================
+    # 반복 제어 설정
+    # ============================================================================
+    max_cycles: int = 3  # 최대 반복 횟수
+    min_improvement_threshold: float = 0.05  # 최소 개선 임계값
+    early_stopping_patience: int = 2  # Early stopping
+    # ============================================================================
+    # IPO 추출 설정
+    # ============================================================================
+    max_ipo_triples: int = 10  # 추출할 최대 트리플 수
+    python_executor_timeout: int = 5  # AZR보다 짧은 타임아웃
+    validate_triples: bool = True  # 트리플 검증 여부
+    # ============================================================================
+    # 다중 프로그램 생성 설정
+    # ============================================================================
+    num_program_variations: int = 4  # 생성할 다양한 프로그램 수
+    baseline_evaluation_rounds: int = 5  # 베이스라인 성능 측정 횟수
+    diverse_generation_temperature: float = 0.7  # 다양한 프로그램 생성용 temperature
+    baseline_generation_temperature: float = 0.05  # 베이스라인 측정용 temperature
+    # ============================================================================
+    # 태스크 생성 설정
+    # ============================================================================
+    task_distribution: Dict[str, float] = None  # induction:deduction:abduction 비율
+    max_tasks_per_type: int = 5  # 타입별 최대 태스크 수
+    use_azr_templates: bool = True  # AZR 템플릿 사용
+    skip_task_evaluation: bool = True  # Task evaluation(4단계) 스킵 여부 (VeRL에서 수행)
+    # ============================================================================
+    # 보상 설정 (AZR 기반)
+    # ============================================================================
+    use_accuracy_reward: bool = True
+    use_improvement_reward: bool = True  # TestTime 전용 개선도 보상
+    use_complexity_reward: bool = True
+    accuracy_weight: float = 1.0
+    improvement_weight: float = 0.5  # 개선도 가중치
+    complexity_weight: float = 0.1
+    # ============================================================================
+    # 로깅 설정
+    # ============================================================================
+    log_level: str = "INFO"
+    save_intermediate_results: bool = True
+    log_ipo_details: bool = True
+    log_task_details: bool = True
+    log_training_metrics: bool = True
+    # ============================================================================
+    # 메모리 최적화 설정 (AZR 기반)
+    # ============================================================================
+    gpu_memory_utilization: float = 0.4
+    max_workers: int = 2  # Python executor workers
+    use_memory_efficient_attention: bool = True
+    def __post_init__(self):
+        """설정 후처리"""
+        if self.task_distribution is None:
+            # 기본 태스크 분포: 균등 분배
+            self.task_distribution = {
+                "induction": 0.33,
+                "deduction": 0.33,
+                "abduction": 0.34
+            }
+        # device 자동 설정
+        if self.device == "auto":
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # dtype 설정
+        if self.device == "cpu":
+            self.torch_dtype = torch.float32
+    def to_dict(self) -> Dict[str, Any]:
+        """설정을 딕셔너리로 변환"""
+        return {
+            "model_name": self.model_name,
+            "device": self.device,
+            "torch_dtype": str(self.torch_dtype),
+            "max_adaptation_steps": self.max_adaptation_steps,
+            "max_cycles": self.max_cycles,
+            "learning_rate": self.learning_rate,
+            "task_distribution": self.task_distribution,
+            "reward_weights": {
+                "accuracy": self.accuracy_weight,
+                "improvement": self.improvement_weight,
+                "complexity": self.complexity_weight
+            }
+        }
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Any]) -> 'TestTimeConfig':
+        """딕셔너리에서 설정 로드"""
+        return cls(**config_dict)
+@dataclass
+class BenchmarkConfig:
+    """벤치마크별 설정"""
+    name: str  # "humaneval", "mbpp", "livecodebase"
+    data_path: str
+    problem_prefix: str  # "HumanEval", "Mbpp"
+    start_index: int = 0  # MBPP는 2부터 시작
+    max_problems: int = 5  # 테스트할 문제 수
+    # 벤치마크별 특화 설정
+    test_timeout: int = 10
+    use_plus_version: bool = True  # HumanEval+, MBPP+ 사용
+    @classmethod
+    def get_humaneval_config(cls) -> 'BenchmarkConfig':
+        return cls(
+            name="humaneval",
+            data_path="evaluation/code_eval/data/HumanEvalPlus.jsonl",
+            problem_prefix="HumanEval",
+            start_index=0,
+            max_problems=5
+        )
+    @classmethod
+    def get_mbpp_config(cls) -> 'BenchmarkConfig':
+        return cls(
+            name="mbpp",
+            data_path="evaluation/code_eval/data/MbppPlus.jsonl",
+            problem_prefix="Mbpp",
+            start_index=2,  # MBPP는 2번부터
+            max_problems=5
+        )

absolute_zero_reasoner/testtime/ipo_extractor.py ADDED Viewed

	@@ -0,0 +1,1235 @@

+"""
+IPO Triple Extractor
+AZR Python Executor 기반 (Input, Program, Output) 트리플 추출 시스템
+요구사항 2: "AZR Python Executor를 이용하여 (i,p,o) pair를 만든다"
+"""
+import ast
+import re
+import json
+from typing import Dict, List, Any, Tuple, Optional
+from concurrent.futures import TimeoutError
+from ..utils.code_utils.python_executor import PythonExecutor
+from .config import TestTimeConfig
+from .logger import TestTimeLogger
+from .solution_generator import InitialSolutionGenerator
+class IPOBuffer:
+    """IPO triple을 저장하고 관리하는 버퍼"""
+    def __init__(self):
+        self.buffer = {}  # {problem_id: [ipo_triples]}
+    def add(self, problem_id: str, ipo_triple: Dict[str, Any]):
+        """IPO triple을 버퍼에 추가"""
+        if problem_id not in self.buffer:
+            self.buffer[problem_id] = []
+        self.buffer[problem_id].append(ipo_triple)
+    def get_all(self, problem_id: str) -> List[Dict[str, Any]]:
+        """특정 문제의 모든 IPO triple 반환"""
+        return self.buffer.get(problem_id, [])
+    def clear(self, problem_id: str = None):
+        """버퍼 초기화"""
+        if problem_id:
+            self.buffer.pop(problem_id, None)
+        else:
+            self.buffer.clear()
+    def size(self, problem_id: str = None) -> int:
+        """버퍼 크기 반환"""
+        if problem_id:
+            return len(self.buffer.get(problem_id, []))
+        return sum(len(triples) for triples in self.buffer.values())
+class IPOTripleExtractor:
+    """(Input, Program, Output) 트리플 추출 및 검증"""
+    def __init__(self, config: TestTimeConfig, logger: Optional[TestTimeLogger] = None,
+                 model=None, tokenizer=None):
+        self.config = config
+        self.logger = logger or TestTimeLogger()
+        self.model = model
+        self.tokenizer = tokenizer
+        # AZR Python Executor 초기화 (기존 방식)
+        self.executor = PythonExecutor(
+            timeout_length=config.python_executor_timeout,
+            ast_check=True,  # AZR 기본 설정
+            max_workers=config.max_workers
+        )
+        self.extracted_triples = []
+        # 입력 생성 프롬프트와 응답 저장용
+        self.last_generation_prompt = ""
+        self.last_generation_response = ""
+        # VLLM 배치 처리를 위한 참조
+        self.solution_generator = None
+    def extract_triples(self, problem: Dict[str, Any], solution: str) -> List[Dict[str, Any]]:
+        """벤치마크 문제와 솔루션에서 IPO 트리플 추출"""
+        problem_id = problem.get('task_id', 'unknown')
+        self.logger.log_info(f"🔍 Extracting IPO triples for {problem_id}")
+        triples = []
+        try:
+            # 1. 함수 정보 추출 (entry point 우선)
+            entry_point = problem.get('entry_point', 'unknown')
+            func_info = self._extract_function_info(solution, entry_point)
+            if not func_info:
+                self.logger.log_error(f"Failed to extract function info from solution")
+                return []
+            # 2. 테스트 케이스에서 입력-출력 쌍 생성 (LLM 솔루션 기반)
+            test_cases = self._extract_test_cases(problem, solution)
+            # 3. 솔루션 실행으로 IPO 트리플 생성
+            for i, (test_input_str, expected_output) in enumerate(test_cases):
+                if len(triples) >= self.config.max_ipo_triples:
+                    break
+                # test_input_str에서 실제 인자 추출 (예: "strlen('')" -> "''")
+                import re
+                match = re.match(rf'{entry_point}\((.*)\)', test_input_str)
+                if match:
+                    actual_args = match.group(1)
+                else:
+                    actual_args = test_input_str  # fallback
+                triple = self._create_ipo_triple(
+                    func_info['full_code'],  # 🔧 수정: 전체 코드 사용 (도우미 함수 포함)
+                    func_info,
+                    actual_args,  # 실제 인자만 전달
+                    expected_output,
+                    triple_id=f"{problem_id}_triple_{i}",
+                    full_input_str=test_input_str  # 전체 입력 문자열도 전달
+                )
+                if triple:
+                    triples.append(triple)
+            # 🔧 수정: Synthetic 트리플 생성 제거 (단일 예시만 사용하여 치팅 방지)
+            # Synthetic 트리플 생성 로직을 제거하여 진짜 단일 예시만 사용
+            # 검증 및 로깅
+            validation_results = [self._validate_triple(triple) for triple in triples]
+            self.logger.log_ipo_extraction(problem_id, triples, validation_results)
+            # 유효한 트리플만 반환
+            valid_triples = [triple for triple, valid in zip(triples, validation_results) if valid]
+            self.logger.log_info(f"✅ Extracted {len(valid_triples)}/{len(triples)} valid IPO triples")
+            return valid_triples
+        except Exception as e:
+            self.logger.log_error(f"IPO extraction failed: {e}")
+            return []
+    def _extract_function_info(self, solution: str, entry_point: str = None) -> Optional[Dict[str, str]]:
+        """솔루션에서 함수 정보 추출 (entry point 우선)"""
+        try:
+            # 🔧 개선: Raw LLM response인지 확인하고 함수 코드 추출
+            processed_solution = solution
+            if "LLM GENERATED SOLUTION:" in solution:
+                self.logger.log_info("📝 Raw LLM response detected, extracting function code")
+                processed_solution = self._extract_function_from_llm_response(solution)
+                if not processed_solution:
+                    self.logger.log_error("Failed to extract function from LLM response")
+                    return None
+            # AST로 함수 정의 파싱
+            tree = ast.parse(processed_solution)
+            # 🔧 수정: Entry point 함수 우선 검색
+            target_function = None
+            all_functions = []
+            for node in ast.walk(tree):
+                if isinstance(node, ast.FunctionDef):
+                    func_info = {
+                        'name': node.name,
+                        'args': [arg.arg for arg in node.args.args],
+                        'signature': f"def {node.name}({', '.join([arg.arg for arg in node.args.args])}):",
+                        'full_code': processed_solution
+                    }
+                    all_functions.append(func_info)
+                    # Entry point와 일치하는 함수 우선 선택
+                    if entry_point and node.name == entry_point:
+                        target_function = func_info
+                        # 이 로그는 너무 자주 출력되므로 debug 레벨로 변경
+                        self.logger.log_debug(f"🎯 Found entry point function: {entry_point}")
+                        break
+            # Entry point 함수를 찾았으면 반환
+            if target_function:
+                return target_function
+            # Entry point를 찾지 못했으면 첫 번째 함수 반환 (기존 방식)
+            if all_functions:
+                self.logger.log_warning(f"⚠️  Entry point '{entry_point}' not found, using first function: {all_functions[0]['name']}")
+                return all_functions[0]
+            return None
+        except Exception as e:
+            self.logger.log_error(f"Function parsing failed: {e}")
+            return None
+    def _extract_function_from_llm_response(self, llm_response: str) -> str:
+        """Raw LLM response에서 함수 코드 추출 (solution_generator와 동일한 로직)"""
+        lines = llm_response.split('\n')
+        solution_lines = []
+        in_solution = False
+        # "LLM GENERATED SOLUTION:" 섹션 추출 (수정된 로직)
+        for i, line in enumerate(lines):
+            if "LLM GENERATED SOLUTION:" in line:
+                in_solution = True
+                continue
+            elif in_solution:
+                # "===============" 라인이 나오면 종료하되, 첫 번째 "==============="는 건너뛰기
+                if "===============" in line:
+                    # 실제 솔루션 라인들이 있는지 확인
+                    if solution_lines and any(l.strip() for l in solution_lines):
+                        break
+                    else:
+                        # 아직 솔루션 라인이 없으면 계속 진행 (첫 번째 구분선 건너뛰기)
+                        continue
+                solution_lines.append(line)
+        if not solution_lines:
+            return ""  # 추출 실패시 빈 문자열 반환
+        extracted_solution = '\n'.join(solution_lines).strip()
+        # 함수 정의와 import 추출 (solution_generator 로직과 동일)
+        lines = extracted_solution.split('\n')
+        import_lines = []
+        func_lines = []
+        in_function = False
+        indent_level = 0
+        # 1. import 문 수집
+        for line in lines:
+            stripped = line.strip()
+            if (stripped.startswith('import ') or stripped.startswith('from ')) and not stripped.startswith('#'):
+                import_lines.append(line)
+        # 2. 함수 정의 찾기
+        for line in lines:
+            if line.strip().startswith('def '):
+                in_function = True
+                func_lines = [line]
+                indent_level = len(line) - len(line.lstrip())
+            elif in_function:
+                if not line.strip() or (line.strip() and len(line) - len(line.lstrip()) > indent_level):
+                    func_lines.append(line)
+                else:
+                    break
+        # 3. import + function 결합
+        if func_lines:
+            result_lines = import_lines + [''] + func_lines if import_lines else func_lines
+            return '\n'.join(result_lines)
+        else:
+            return extracted_solution
+    def _fix_humaneval_canonical_solution(self, problem: Dict[str, Any]) -> str:
+        """HumanEval canonical solution 복원 (함수 시그니처 추가)"""
+        canonical_code = problem.get('canonical_solution', '')
+        entry_point = problem.get('entry_point', '')
+        prompt = problem.get('prompt', '')
+        # HumanEval인지 확인
+        task_id = problem.get('task_id', '')
+        if not task_id.startswith('HumanEval/'):
+            return canonical_code
+        # 이미 함수 시그니처가 있는지 확인
+        if f"def {entry_point}" in canonical_code:
+            return canonical_code
+        try:
+            # Prompt에서 함수 시그니처 추출
+            import re
+            def_pattern = rf'def\s+{re.escape(entry_point)}\s*\([^)]*\)[^:]*:'
+            match = re.search(def_pattern, prompt, re.MULTILINE)
+            if match:
+                function_signature = match.group(0)
+                # Import 문도 추출 (있다면)
+                import_lines = []
+                for line in prompt.split('\n'):
+                    stripped = line.strip()
+                    if (stripped.startswith('import ') or stripped.startswith('from ')) and not stripped.startswith('#'):
+                        import_lines.append(line)
+                # 완전한 canonical solution 구성
+                if import_lines:
+                    complete_canonical = '\n'.join(import_lines) + '\n\n' + function_signature + canonical_code
+                else:
+                    complete_canonical = function_signature + canonical_code
+                self.logger.log_info(f"🔧 Fixed HumanEval canonical solution for {entry_point}")
+                return complete_canonical
+            else:
+                self.logger.log_warning(f"⚠️  Could not extract function signature for {entry_point}")
+                return canonical_code
+        except Exception as e:
+            self.logger.log_error(f"Failed to fix HumanEval canonical solution: {e}")
+            return canonical_code
+    def _extract_single_prompt_example(self, problem: Dict[str, Any]) -> Optional[Tuple[str, str]]:
+        """🔧 새로운 메서드: 프롬프트의 단일 예시만 추출 (치팅 방지)"""
+        try:
+            # base_input의 첫 번째 항목을 단일 예시로 사용
+            if 'base_input' in problem and problem['base_input']:
+                first_input = problem['base_input'][0]
+                entry_point = problem['entry_point']
+                self.logger.log_info(f"📥 Using first base_input as single example: {first_input}")
+                # 🔧 수정: HumanEval canonical solution 복원
+                canonical_code = self._fix_humaneval_canonical_solution(problem)
+                if canonical_code:
+                    actual_output = self._execute_llm_solution(canonical_code, entry_point, first_input)
+                    if actual_output is not None:
+                        # 입력 문자열 형식 생성
+                        if isinstance(first_input, list):
+                            if len(first_input) == 1 and isinstance(first_input[0], list):
+                                # [[args]] -> 단일 리스트 인자로 표시
+                                input_str = repr(first_input[0])
+                            elif len(first_input) == 1:
+                                # [단일인자] -> 단일인자
+                                input_str = repr(first_input[0])
+                            else:
+                                # [다중인자] -> 다중인자
+                                input_str = ', '.join(repr(arg) for arg in first_input)
+                        else:
+                            input_str = repr(first_input)
+                        result = (input_str, str(actual_output))
+                        self.logger.log_info(f"✅ Single example extracted: Input={input_str}, Output={actual_output}")
+                        return result
+                    else:
+                        self.logger.log_warning("❌ Failed to compute output with canonical solution")
+                else:
+                    self.logger.log_warning("❌ No canonical solution available")
+            else:
+                self.logger.log_warning("❌ No base_input available")
+        except Exception as e:
+            self.logger.log_error(f"Single example extraction failed: {e}")
+        return None
+    def _extract_docstring_examples(self, prompt: str, func_name: str) -> List[Tuple[str, str]]:
+        """docstring에서 >>> 예제 추출"""
+        examples = []
+        lines = prompt.split('\n')
+        i = 0
+        while i < len(lines):
+            line = lines[i].strip()
+            # >>> func_name(...) 패턴 찾기
+            if line.startswith('>>>') and func_name in line:
+                # 입력 추출
+                input_line = line[3:].strip()  # >>> 제거
+                # 다음 줄에서 출력 추출
+                if i + 1 < len(lines):
+                    output_line = lines[i + 1].strip()
+                    # 출력이 >>> 로 시작하지 않으면 출력값
+                    if not output_line.startswith('>>>'):
+                        examples.append((input_line, output_line))
+                        i += 2
+                        continue
+                i += 1
+            else:
+                i += 1
+        return examples
+    def _extract_test_cases(self, problem: Dict[str, Any], solution: str) -> List[Tuple[str, str]]:
+        """docstring의 예제에서 테스트 케이스 추출 (치팅 방지)"""
+        test_cases = []
+        func_name = problem.get('entry_point', 'unknown')
+        problem_id = problem.get('task_id', '')
+        # HumanEval과 MBPP 모두 docstring 예제만 사용
+        self.logger.log_info(f"🎯 Extracting docstring examples for {problem_id}")
+        # 프롬프트에서 docstring 예제 추출
+        prompt = problem.get('prompt', '')
+        examples = self._extract_docstring_examples(prompt, func_name)
+        if examples:
+            self.logger.log_info(f"📝 Found {len(examples)} docstring examples")
+            for i, (input_str, expected_output) in enumerate(examples):
+                try:
+                    # 입력 파싱 (func_name(args) 형태에서 args 추출)
+                    import ast
+                    # "func_name(args)" -> args 추출
+                    if input_str.startswith(func_name + '(') and input_str.endswith(')'):
+                        args_str = input_str[len(func_name)+1:-1]
+                        # 안전한 평가를 위해 ast.literal_eval 사용
+                        try:
+                            # 단일 인자인 경우
+                            input_args = ast.literal_eval(args_str)
+                            if not isinstance(input_args, tuple):
+                                input_args = (input_args,)
+                        except:
+                            # 여러 인자인 경우
+                            input_args = ast.literal_eval(f"({args_str})")
+                        # LLM 솔루션 실행
+                        actual_output = self._execute_llm_solution(solution, func_name, list(input_args))
+                        if actual_output is not None:
+                            test_cases.append((input_str, str(actual_output)))
+                            self.logger.log_info(f"✅ Example {i+1}: {input_str} -> {actual_output}")
+                        else:
+                            self.logger.log_warning(f"❌ Example {i+1} execution failed")
+                except Exception as e:
+                    self.logger.log_error(f"Example {i+1} parsing failed: {e}")
+        else:
+            self.logger.log_warning(f"⚠️ No docstring examples found, falling back to first base_input")
+            # docstring 예제가 없으면 첫 번째 base_input만 사용 (MBPP처럼)
+            if 'base_input' in problem and problem['base_input']:
+                inp_args = problem['base_input'][0]
+                # 입력 문자열 생성
+                if isinstance(inp_args, list):
+                    args_str = ', '.join(repr(arg) for arg in inp_args)
+                    input_str = f"{func_name}({args_str})"
+                else:
+                    input_str = f"{func_name}({repr(inp_args)})"
+                actual_output = self._execute_llm_solution(solution, func_name, inp_args)
+                if actual_output is not None:
+                    test_cases.append((input_str, str(actual_output)))
+        self.logger.log_info(f"📊 Extracted {len(test_cases)} test cases from docstring examples")
+        return test_cases
+    def _execute_llm_solution(self, llm_solution: str, func_name: str, input_args) -> Optional[str]:
+        """LLM 생성 솔루션을 실행하여 실제 출력 계산"""
+        try:
+            if not llm_solution or func_name == 'unknown':
+                return None
+            # 🔧 수정: 실행용 코드 구성 (MBPP+ 이중 리스트 처리)
+            if isinstance(input_args, list):
+                # MBPP+ 데이터가 이중 리스트로 감싸진 경우 처리
+                if len(input_args) == 1 and isinstance(input_args[0], list):
+                    # [[args]] -> 단일 리스트 인자로 전달
+                    args_str = repr(input_args[0])
+                elif len(input_args) == 1:
+                    # [단일인자] -> 단일 인자로 전달
+                    args_str = repr(input_args[0])
+                else:
+                    # [다중인자] -> 다중 인자로 전달
+                    args_str = ', '.join(repr(arg) for arg in input_args)
+            else:
+                args_str = repr(input_args)
+            execution_code = f"""
+{llm_solution}
+# Execute LLM solution
+try:
+    result = {func_name}({args_str})
+    print(repr(result))
+except Exception as e:
+    print(f"EXECUTION_ERROR: {{e}}")
+"""
+            # AZR Python Executor로 실행
+            output, status = self.executor.apply(execution_code)
+            if 'error' in status.lower() or 'EXECUTION_ERROR' in output:
+                return None
+            # 출력에서 결과 추출
+            output_lines = output.strip().split('\n')
+            if output_lines:
+                result_line = output_lines[-1].strip()
+                # repr()로 출력된 결과를 그대로 반환
+                return result_line
+            return None
+        except Exception as e:
+            self.logger.log_error(f"LLM solution execution failed: {e}")
+            return None
+    def _create_ipo_triple(self, solution: str, func_info: Dict[str, str],
+                          test_input: str, expected_output: str,
+                          triple_id: str, full_input_str: str = None) -> Optional[Dict[str, Any]]:
+        """IPO 트리플 생성 및 검증 (AZR Python Executor 사용)"""
+        try:
+            # 1. 솔루션 실행으로 실제 출력 확인
+            actual_output = self._execute_function(solution, func_info['name'], test_input)
+            if actual_output is None:
+                return None
+            # 2. IPO 트리플 구성
+            triple = {
+                'id': triple_id,
+                'input': test_input,  # 실제 인자만 저장 (예: "''", "3.5")
+                'full_input_str': full_input_str or f"{func_info['name']}({test_input})",  # 전체 입력 문자열은 별도 필드에
+                'program': solution,  # 이미 func_info['full_code']가 전달됨
+                'expected_output': expected_output,
+                'actual_output': actual_output,
+                'function_name': func_info['name'],
+                'function_args': func_info['args'],
+                'is_correct': str(actual_output) == str(expected_output),
+                'extraction_method': 'test_case'
+            }
+            return triple
+        except Exception as e:
+            self.logger.log_error(f"Triple creation failed for {triple_id}: {e}")
+            return None
+    def _execute_function(self, code: str, func_name: str, inputs: str) -> Optional[str]:
+        """AZR Python Executor로 함수 실행"""
+        try:
+            # 실행용 코드 구성 (AZR 템플릿 스타일)
+            execution_code = f"""
+{code}
+# Execute function with inputs
+try:
+    result = {func_name}({inputs})
+    print(repr(result))
+except Exception as e:
+    print(f"EXECUTION_ERROR: {{e}}")
+"""
+            # AZR 방식으로 실행
+            output, status = self.executor.apply(execution_code)
+            if 'error' in status.lower() or 'EXECUTION_ERROR' in output:
+                return None
+            # 출력에서 결과 추출
+            output_lines = output.strip().split('\n')
+            if output_lines:
+                return output_lines[-1].strip()
+            return None
+        except Exception as e:
+            self.logger.log_error(f"Function execution failed: {e}")
+            return None
+    # 🔧 제거: Synthetic 트리플 생성 메서드들 제거
+    # 단일 예시만 사용하여 치팅 방지 목적에 맞게 불필요한 메서드들 제거
+    def _validate_triple(self, triple: Dict[str, Any]) -> bool:
+        """IPO 트리플 검증"""
+        if not self.config.validate_triples:
+            return True
+        try:
+            # 1. 기본 필드 존재 확인
+            required_fields = ['input', 'program', 'expected_output', 'function_name']
+            if not all(field in triple for field in required_fields):
+                return False
+            # 2. 코드 구문 검증
+            try:
+                ast.parse(triple['program'])
+            except SyntaxError:
+                return False
+            # 3. 재실행으로 일관성 검증 (AZR 방식)
+            # 이제 triple['input']은 이미 실제 인자만 포함
+            actual_output = self._execute_function(
+                triple['program'],
+                triple['function_name'],
+                triple['input']
+            )
+            if actual_output is None:
+                return False
+            # 4. 출력 일치 확인
+            return str(actual_output) == str(triple['expected_output'])
+        except Exception as e:
+            self.logger.log_error(f"Triple validation failed: {e}")
+            return False
+    def get_triple_statistics(self) -> Dict[str, Any]:
+        """추출된 트리플 통계"""
+        if not self.extracted_triples:
+            return {"total": 0, "valid": 0, "invalid": 0}
+        valid_count = sum(1 for triple in self.extracted_triples if triple.get('is_correct', False))
+        return {
+            "total": len(self.extracted_triples),
+            "valid": valid_count,
+            "invalid": len(self.extracted_triples) - valid_count,
+            "extraction_methods": {
+                "test_case": sum(1 for t in self.extracted_triples if t.get('extraction_method') == 'test_case'),
+                "synthetic": sum(1 for t in self.extracted_triples if t.get('extraction_method') == 'synthetic')
+            }
+        }
+    def generate_diverse_inputs(self, problem: Dict[str, Any], solution: str,
+                              existing_examples: List[Tuple[str, str]]) -> List[Dict[str, Any]]:
+        """LLM을 사용하여 다양한 입력 생성"""
+        problem_id = problem.get('task_id', 'unknown')
+        self.logger.log_info(f"🎲 Generating diverse inputs for {problem_id}")
+        try:
+            # 1. 함수 정보 추출
+            entry_point = problem.get('entry_point', 'unknown')
+            func_info = self._extract_function_info(solution, entry_point)
+            if not func_info:
+                self.logger.log_error("Failed to extract function info for input generation")
+                return []
+            # 2. 인자 타입 정보 추론
+            arg_type_info = self._infer_argument_types(func_info, existing_examples, solution)
+            # 3. 프롬프트 생성
+            prompt = self._create_input_generation_prompt(
+                problem_description=problem.get('prompt', ''),
+                existing_examples=existing_examples,
+                full_code=solution,
+                arg_type_info=arg_type_info
+            )
+            # 4. LLM으로 입력 생성
+            generated_inputs = self._call_llm_for_inputs(prompt, existing_examples, func_info, arg_type_info)
+            # 5. 생성된 입력 검증
+            valid_inputs = self._validate_generated_inputs(generated_inputs, func_info, solution)
+            self.logger.log_info(f"✅ Generated {len(valid_inputs)} valid diverse inputs")
+            return valid_inputs
+        except Exception as e:
+            self.logger.log_error(f"Failed to generate diverse inputs: {e}")
+            return []
+    def generate_diverse_inputs_batch(self, program_input_pairs: List[Dict[str, Any]]) -> Tuple[List[List[Dict[str, Any]]], List[Optional[Dict[str, Any]]]]:
+        """Generate diverse inputs for several programs with one batched LLM call.
+        Args:
+            program_input_pairs: One dict per program; the keys ``problem``,
+                ``solution`` and ``existing_examples`` are read.
+        Returns:
+            ``(batch_results, batch_generation_info)``. ``batch_results`` holds
+            the validated inputs per program (``[]`` when a program could not be
+            processed). ``batch_generation_info`` holds, per program, a dict with
+            the prompt, raw response and parsed/validated inputs, an error dict
+            when processing that item raised, or ``None`` when no function info
+            could be extracted. ``([], [])`` is returned when the solution
+            generator is missing, every prompt is empty, or the batch fails.
+        """
+        if not self.solution_generator:
+            self.logger.log_error("Solution generator not set for batch processing")
+            return [], []
+        self.logger.log_info(f"🎲 Generating diverse inputs for {len(program_input_pairs)} programs (BATCH)")
+        try:
+            # Build the input-generation prompt for every program.
+            batch_prompts = []
+            program_contexts = []
+            for pair in program_input_pairs:
+                problem = pair['problem']
+                solution = pair['solution']
+                existing_examples = pair['existing_examples']
+                # Extract the function information.
+                entry_point = problem.get('entry_point', 'unknown')
+                func_info = self._extract_function_info(solution, entry_point)
+                if not func_info:
+                    # Keep both lists aligned with program_input_pairs: a None
+                    # context and an empty prompt mark the unusable program.
+                    program_contexts.append(None)
+                    batch_prompts.append("")
+                    continue
+                # Infer argument type information.
+                arg_type_info = self._infer_argument_types(func_info, existing_examples, solution)
+                # Build the prompt.
+                prompt = self._create_input_generation_prompt(
+                    problem_description=problem.get('prompt', ''),
+                    existing_examples=existing_examples,
+                    full_code=solution,
+                    arg_type_info=arg_type_info
+                )
+                batch_prompts.append(prompt)
+                program_contexts.append({
+                    'func_info': func_info,
+                    'solution': solution,
+                    'problem': problem
+                })
+            # Call the LLM through the vLLM batch interface.
+            if not batch_prompts or all(not p for p in batch_prompts):
+                return [], []
+            self.logger.log_info(f"🔍 Sending {len(batch_prompts)} prompts to VLLM for input generation")
+            self.logger.log_info(f"🔍 First prompt preview: {batch_prompts[0][:200]}..." if batch_prompts else "No prompts")
+            # Input generation is not code generation, so the raw responses are
+            # used without post-processing; generate_batch's post-processing
+            # (function extraction etc.) is unsuitable for input generation.
+            batch_responses = self.solution_generator._generate_batch_with_vllm(
+                batch_prompts,
+                temperature=0.7  # input generation needs some randomness
+            )
+            self.logger.log_info(f"🔍 Received {len(batch_responses)} responses from VLLM")
+            for i, response in enumerate(batch_responses[:2]):  # log only the first two
+                self.logger.log_info(f"🔍 Response {i} preview: {response[:200]}...")
+            # Parse each response into generated inputs.
+            batch_results = []
+            batch_generation_info = []  # input-generation details per program
+            # NOTE(review): zip stops at the shorter sequence, so this assumes the
+            # generator returns one response per prompt — confirm.
+            for i, (response, context) in enumerate(zip(batch_responses, program_contexts)):
+                if context is None:
+                    batch_results.append([])
+                    batch_generation_info.append(None)
+                    continue
+                try:
+                    # Extract the inputs from the response.
+                    generated_inputs = self._parse_llm_input_response(
+                        response,
+                        context['func_info'],
+                        context['problem'].get('task_id', 'unknown')
+                    )
+                    # Debugging: log how many inputs were parsed.
+                    self.logger.log_info(f"🔍 Parsed {len(generated_inputs)} inputs from response {i}")
+                    if generated_inputs:
+                        self.logger.log_info(f"🔍 First parsed input: {generated_inputs[0]}")
+                    # Validate the generated inputs.
+                    valid_inputs = self._validate_generated_inputs(
+                        generated_inputs,
+                        context['func_info'],
+                        context['solution']
+                    )
+                    # Debugging: log how many inputs survived validation.
+                    self.logger.log_info(f"🔍 {len(valid_inputs)} inputs passed validation from response {i}")
+                    batch_results.append(valid_inputs)
+                    # Record the input-generation details.
+                    generation_info = {
+                        'prompt': batch_prompts[i] if i < len(batch_prompts) else '',
+                        'llm_response': response,
+                        'extracted_inputs': generated_inputs,
+                        'valid_inputs': valid_inputs,
+                        'existing_examples': program_input_pairs[i]['existing_examples'] if i < len(program_input_pairs) else [],
+                        'function_info': context['func_info'],
+                        'arg_type_info': self._infer_argument_types(
+                            context['func_info'],
+                            program_input_pairs[i]['existing_examples'] if i < len(program_input_pairs) else [],
+                            context['solution']
+                        )
+                    }
+                    batch_generation_info.append(generation_info)
+                except Exception as e:
+                    self.logger.log_error(f"Failed to process batch item {i}: {e}")
+                    # Extra debugging detail.
+                    self.logger.log_error(f"Response preview: {response[:200]}...")
+                    import traceback
+                    self.logger.log_error(f"Traceback: {traceback.format_exc()}")
+                    batch_results.append([])
+                    # Record the error details as well.
+                    batch_generation_info.append({
+                        'error': str(e),
+                        'prompt': batch_prompts[i] if i < len(batch_prompts) else '',
+                        'llm_response': response,
+                        'traceback': traceback.format_exc()
+                    })
+            total_generated = sum(len(inputs) for inputs in batch_results)
+            self.logger.log_info(f"✅ Generated {total_generated} diverse inputs across {len(program_input_pairs)} programs")
+            # Return both inputs and generation info as a tuple
+            return batch_results, batch_generation_info
+        except Exception as e:
+            self.logger.log_error(f"Batch input generation failed: {e}")
+            return [], []
+    def _parse_llm_input_response(self, llm_response: str, func_info: Dict[str, Any], problem_id: str) -> List[Dict[str, Any]]:
+        """LLM 응답에서 입력 예제 파싱"""
+        self.logger.log_info(f"🔍 Parsing LLM response for {problem_id}, response length: {len(llm_response)}")
+        try:
+            # ```python ... ``` 블록에서 코드 추출
+            import re
+            code_pattern = r'```python\n(.*?)\n```'
+            matches = re.findall(code_pattern, llm_response, re.DOTALL)
+            if not matches:
+                self.logger.log_info("🔍 No code block found, searching for examples = [")
+                # 블록이 없으면 전체 응답에서 examples = 찾기
+                if 'examples = [' in llm_response:
+                    start = llm_response.find('examples = [')
+                    # 균형잡힌 괄호 찾기
+                    bracket_count = 0
+                    end = start
+                    for i, char in enumerate(llm_response[start:]):
+                        if char == '[':
+                            bracket_count += 1
+                        elif char == ']':
+                            bracket_count -= 1
+                            if bracket_count == 0:
+                                end = start + i + 1
+                                break
+                    if end > start:
+                        code = llm_response[start:end]
+                        self.logger.log_info(f"🔍 Found examples code: {code[:100]}...")
+                        exec_globals = {}
+                        exec(code, exec_globals)
+                        examples = exec_globals.get('examples', [])
+                        self.logger.log_info(f"🔍 Extracted {len(examples)} examples")
+                        return examples
+                else:
+                    self.logger.log_info("🔍 No 'examples = [' found in response")
+            else:
+                # 코드 블록에서 examples 추출
+                self.logger.log_info(f"🔍 Found {len(matches)} code blocks")
+                code = matches[0]
+                self.logger.log_info(f"🔍 Code block preview: {code[:100]}...")
+                exec_globals = {}
+                exec(code, exec_globals)
+                examples = exec_globals.get('examples', [])
+                self.logger.log_info(f"🔍 Extracted {len(examples)} examples from code block")
+                # examples가 dict가 아닌 경우 처리
+                if examples and len(examples) > 0:
+                    self.logger.log_info(f"🔍 First example type: {type(examples[0])}")
+                    if isinstance(examples[0], dict):
+                        # expected_output, description 등 불필요한 키 제거
+                        cleaned_examples = []
+                        for ex in examples:
+                            cleaned = {k: v for k, v in ex.items()
+                                     if k not in ['expected_output', 'description']}
+                            if cleaned:  # 빈 dict가 아닌 경우만 추가
+                                cleaned_examples.append(cleaned)
+                        self.logger.log_info(f"🔍 Cleaned {len(cleaned_examples)} examples")
+                        return cleaned_examples
+                return examples
+            return []
+        except Exception as e:
+            self.logger.log_error(f"Failed to parse generated examples for {problem_id}: {e}")
+            import traceback
+            self.logger.log_error(f"Traceback: {traceback.format_exc()}")
+            return []
+    def _infer_argument_types(self, func_info: Dict[str, str],
+                            examples: List[Tuple[str, str]],
+                            solution: str) -> Dict[str, str]:
+        """기존 예제와 AST 분석으로 인자 타입 추론"""
+        arg_types = {}
+        func_name = func_info['name']
+        arg_names = func_info['args']
+        # 1. AST에서 type annotation 추출
+        try:
+            tree = ast.parse(solution)
+            for node in ast.walk(tree):
+                if isinstance(node, ast.FunctionDef) and node.name == func_name:
+                    for i, arg in enumerate(node.args.args):
+                        if i < len(arg_names) and arg.annotation:
+                            # Type annotation이 있는 경우
+                            arg_types[arg_names[i]] = ast.unparse(arg.annotation)
+        except:
+            pass
+        # 2. 기존 예제에서 타입 추론
+        if examples:
+            for input_str, _ in examples:
+                # "func_name(args)" 형태에서 args 추출
+                if input_str.startswith(func_name + '(') and input_str.endswith(')'):
+                    args_str = input_str[len(func_name)+1:-1]
+                    try:
+                        # 인자 파싱
+                        parsed_args = eval(f"({args_str},)")
+                        if not isinstance(parsed_args, tuple):
+                            parsed_args = (parsed_args,)
+                        # 각 인자의 타입 추론
+                        for i, arg_value in enumerate(parsed_args):
+                            if i < len(arg_names):
+                                arg_name = arg_names[i]
+                                arg_type = type(arg_value).__name__
+                                # 특별한 케이스 처리
+                                if isinstance(arg_value, list):
+                                    if arg_value and all(isinstance(x, type(arg_value[0])) for x in arg_value):
+                                        inner_type = type(arg_value[0]).__name__
+                                        arg_type = f"List[{inner_type}]"
+                                    else:
+                                        arg_type = "List"
+                                # 기존 타입과 병합
+                                if arg_name not in arg_types:
+                                    arg_types[arg_name] = arg_type
+                    except:
+                        pass
+        # 3. 타입 정보 딕셔너리로 반환
+        # arg_types가 비어있으면 unknown 타입으로 채우기
+        for arg_name in arg_names:
+            if arg_name not in arg_types:
+                arg_types[arg_name] = "Any (type unknown)"
+        return arg_types
+    def _create_input_generation_prompt(self, problem_description: str,
+                                      existing_examples: List[Tuple[str, str]],
+                                      full_code: str,
+                                      arg_type_info: Dict[str, str]) -> str:
+        """입력 생성을 위한 프롬프트 생성"""
+        # 모든 기존 예제를 포맷팅
+        examples_text = ""
+        for i, (input_str, output_str) in enumerate(existing_examples):
+            examples_text += f"Example {i+1}:\n"
+            examples_text += f"Input: {input_str}\n"
+            examples_text += f"Output: {output_str}\n\n"
+        # arg_type_info를 문자열로 포맷팅
+        arg_type_text = "Argument types:\n"
+        for arg, arg_type in arg_type_info.items():
+            arg_type_text += f"- {arg}: {arg_type}\n"
+        prompt = f"""Given the following problem description and its Python function implementation, first analyze the types and valid ranges of the function arguments, then write **5 different example inputs** for the function that cover a diverse mix of typical (general) cases and edge/boundary cases.
+Problem Description:
+'''
+{problem_description}
+'''
+Existing Examples from Problem:
+{examples_text}
+Function Implementation:
+```python
+{full_code}
+```
+{arg_type_text}
+Based on the existing examples above, generate 5 NEW diverse test inputs that are different from the existing ones. Each input should be a Python dict where:
+- Keys are the exact parameter names from the function signature
+- Values are appropriate test values for each parameter
+Format your response as:
+```python
+examples = [
+    {{dict_with_all_function_parameters}},  # Description of this test case
+    {{dict_with_all_function_parameters}},  # Description of this test case
+    ...  # Continue for all 5 examples
+]
+```
+Ensure your examples include:
+- At least 2 typical/general cases
+- At least 2 edge/boundary cases
+- 1 special case (empty, zero, maximum values, etc.)
+- All examples should be DIFFERENT from the existing examples shown above"""
+        return prompt
+    def _call_llm_for_inputs(self, prompt: str, existing_examples: List[Tuple[str, str]],
+                           func_info: Dict[str, Any], arg_type_info: str) -> List[Dict[str, Any]]:
+        """LLM을 호출하여 입력 생성 및 파싱"""
+        # 프롬프트 저장
+        self.last_generation_prompt = prompt
+        try:
+            # Input 생성용 전용 LLM 호출 (temperature=0.5)
+            if self.model is not None and self.tokenizer is not None:
+                # VLLM 사용 확인
+                try:
+                    from vllm import LLM
+                    if isinstance(self.model, LLM):
+                        response = self._generate_with_vllm_for_inputs(prompt)
+                    else:
+                        response = self._generate_with_hf_for_inputs(prompt)
+                except ImportError:
+                    response = self._generate_with_hf_for_inputs(prompt)
+                # 응답 저장
+                self.last_generation_response = response
+                # 응답에서 examples 추출
+                parsed_inputs = self._parse_generated_examples(response)
+                # 입력 생성 정보 저장
+                self.last_input_generation_info = {
+                    'prompt': prompt,
+                    'llm_response': response,
+                    'extracted_inputs': parsed_inputs,
+                    'existing_examples': existing_examples,
+                    'function_info': func_info,
+                    'arg_type_info': arg_type_info
+                }
+                return parsed_inputs
+            else:
+                # 모델이 없으면 빈 리스트 반환 (테스트 환경)
+                self.logger.log_warning("No model available for input generation")
+                self.last_generation_response = "No model available"
+                # 실패한 경우에도 정보 저장
+                self.last_input_generation_info = {
+                    'prompt': prompt,
+                    'llm_response': "No model available",
+                    'extracted_inputs': [],
+                    'existing_examples': existing_examples,
+                    'function_info': func_info,
+                    'arg_type_info': arg_type_info,
+                    'error': "No model available"
+                }
+                return []
+        except Exception as e:
+            self.logger.log_error(f"Failed to call LLM for inputs: {e}")
+            self.last_generation_response = f"Error: {str(e)}"
+            # 에러 발생 시에도 정보 저장
+            self.last_input_generation_info = {
+                'prompt': locals().get('prompt', 'N/A'),
+                'llm_response': f"Error: {str(e)}",
+                'extracted_inputs': [],
+                'existing_examples': locals().get('existing_examples', []),
+                'function_info': locals().get('func_info', {}),
+                'arg_type_info': locals().get('arg_type_info', 'N/A'),
+                'error': str(e)
+            }
+            return []
+    def _generate_with_vllm_for_inputs(self, prompt: str) -> str:
+        """Input 생성용 VLLM 백엔드 (temperature=0.5로 다양성 확보)"""
+        try:
+            from vllm import SamplingParams
+            # Input 생성용 높은 temperature 설정
+            sampling_params = SamplingParams(
+                temperature=0.5,        # 다양한 입력 생성을 위한 높은 temperature
+                max_tokens=2048,
+                top_p=0.95,            # 다양성을 위해 top_p 사용
+                stop=["\n```\n"],      # 코드 블록 종료 시 정지
+            )
+            outputs = self.model.generate([prompt], sampling_params, use_tqdm=False)
+            return outputs[0].outputs[0].text.replace("\t", "    ").strip()
+        except Exception as e:
+            self.logger.log_error(f"VLLM input generation failed: {e}")
+            return ""
+    def _generate_with_hf_for_inputs(self, prompt: str) -> str:
+        """Input 생성용 HuggingFace 백엔드 (temperature=0.5로 다양성 확보)"""
+        try:
+            import torch
+            # 토크나이저 처리
+            inputs = self.tokenizer(prompt, return_tensors='pt', truncation=True, max_length=4096)
+            # attention mask 명시적으로 설정
+            if 'attention_mask' not in inputs:
+                inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])
+            # 디바이스 이동
+            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+            with torch.no_grad():
+                # 메모리 정리
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                # Input 생성용 sampling 설정
+                outputs = self.model.generate(
+                    inputs['input_ids'],
+                    attention_mask=inputs['attention_mask'],
+                    max_new_tokens=2048,
+                    do_sample=True,         # sampling 활성화
+                    temperature=0.5,        # 다양한 입력 생성을 위한 temperature
+                    top_p=0.95,            # 다양성을 위해 top_p 사용
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id
+                )
+            # 응답 추출
+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            response = response[len(prompt):].strip()
+            return response
+        except Exception as e:
+            self.logger.log_error(f"HuggingFace input generation failed: {e}")
+            return ""
+    def _parse_generated_examples(self, llm_response: str) -> List[Dict[str, Any]]:
+        """LLM 응답에서 예제 파싱"""
+        try:
+            # ```python ... ``` 블록에서 코드 추출
+            import re
+            code_pattern = r'```python\n(.*?)\n```'
+            matches = re.findall(code_pattern, llm_response, re.DOTALL)
+            if not matches:
+                # 블록이 없으면 전체 응답에서 examples = 찾기
+                if 'examples = [' in llm_response:
+                    start = llm_response.find('examples = [')
+                    # 균형잡힌 괄호 찾기
+                    bracket_count = 0
+                    end = start
+                    for i, char in enumerate(llm_response[start:]):
+                        if char == '[':
+                            bracket_count += 1
+                        elif char == ']':
+                            bracket_count -= 1
+                            if bracket_count == 0:
+                                end = start + i + 1
+                                break
+                    if end > start:
+                        code = llm_response[start:end]
+                        exec_globals = {}
+                        exec(code, exec_globals)
+                        return exec_globals.get('examples', [])
+            else:
+                # 코드 블록에서 examples 추출
+                code = matches[0]
+                exec_globals = {}
+                exec(code, exec_globals)
+                return exec_globals.get('examples', [])
+            return []
+        except Exception as e:
+            self.logger.log_error(f"Failed to parse generated examples: {e}")
+            return []
+    def _validate_generated_inputs(self, generated_inputs: List[Dict[str, Any]],
+                                 func_info: Dict[str, str],
+                                 solution: str) -> List[Dict[str, Any]]:
+        """생성된 입력의 유효성 검증"""
+        valid_inputs = []
+        func_name = func_info['name']
+        for i, input_dict in enumerate(generated_inputs):
+            try:
+                # 1. 필수 인자 확인
+                required_args = set(func_info['args'])
+                provided_args = set(input_dict.keys())
+                if not required_args.issubset(provided_args):
+                    self.logger.log_warning(f"Input {i+1} missing required args: {required_args - provided_args}")
+                    continue
+                # 2. 실제 실행으로 검증
+                # 인자를 순서대로 배열
+                args = [input_dict[arg] for arg in func_info['args'] if arg in input_dict]
+                # 실행 테스트
+                output = self._execute_llm_solution(solution, func_name, args)
+                if output is not None:
+                    valid_inputs.append(input_dict)
+                    self.logger.log_info(f"✅ Valid input {i+1}: {input_dict}")
+                else:
+                    self.logger.log_warning(f"❌ Input {i+1} execution failed")
+            except Exception as e:
+                self.logger.log_error(f"Input {i+1} validation error: {e}")
+        return valid_inputs
+    def create_ipo_from_input(self, problem: Dict[str, Any],
+                            solution: str,
+                            input_dict: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        """새로운 입력으로 IPO triple 생성"""
+        try:
+            problem_id = problem.get('task_id', 'unknown')
+            entry_point = problem.get('entry_point', 'unknown')
+            # 함수 정보 추출
+            func_info = self._extract_function_info(solution, entry_point)
+            if not func_info:
+                return None
+            # 인자를 순서대로 배열
+            args = [input_dict[arg] for arg in func_info['args'] if arg in input_dict]
+            # 실행하여 출력 얻기
+            output = self._execute_llm_solution(solution, func_info['name'], args)
+            if output is None:
+                return None
+            # 입력 문자열 생성
+            args_str = ', '.join(repr(arg) for arg in args)
+            full_input_str = f"{func_info['name']}({args_str})"
+            # IPO triple 생성
+            triple_id = f"{problem_id}_generated_{len(self.extracted_triples)}"
+            triple = {
+                'id': triple_id,
+                'input': args_str,  # 실제 인자만
+                'full_input_str': full_input_str,  # 전체 함수 호출
+                'program': solution,
+                'expected_output': output,
+                'actual_output': output,
+                'function_name': func_info['name'],
+                'function_args': func_info['args'],
+                'is_correct': True,  # 생성된 것은 항상 정확
+                'extraction_method': 'generated'
+            }
+            return triple
+        except Exception as e:
+            self.logger.log_error(f"Failed to create IPO from input: {e}")
+            return None
+    def cleanup(self):
+        """리소스 정리"""
+        if hasattr(self.executor, 'cleanup'):
+            self.executor.cleanup()

absolute_zero_reasoner/testtime/logger.py ADDED Viewed

	@@ -0,0 +1,295 @@

+"""
+TestTime Logger
+TestTime RLVR을 위한 포괄적 로깅 시스템
+요구사항에 따른 모든 단계별 로그 기록
+"""
+import json
+import os
+import time
+from datetime import datetime
+from typing import Dict, List, Any, Optional
+from pathlib import Path
+import logging
+class TestTimeLogger:
+    """TestTime RLVR 전용 로거"""
+    def __init__(self, log_dir: str = "logs", log_level: str = "INFO", task_output_dir: str = None, log_file: str = None):
+        # 설계된 구조에 맞는 로그 디렉토리 설정
+        if task_output_dir:
+            # TTRLVR 통합 모드: 설계된 디렉토리 구조 사용
+            self.log_dir = Path(task_output_dir)
+            self.use_integrated_structure = True
+        else:
+            # 기존 모드: 기본 logs 디렉토리 사용
+            self.log_dir = Path(log_dir)
+            self.use_integrated_structure = False
+        self.log_dir.mkdir(parents=True, exist_ok=True)
+        # 디렉토리 구조에 따른 서브 디렉토리 생성
+        if self.use_integrated_structure:
+            # 설계된 구조: round_N 하위에 세부 디렉토리
+            (self.log_dir / "current_evaluation").mkdir(exist_ok=True)
+            (self.log_dir / "diverse_programs").mkdir(exist_ok=True)
+            (self.log_dir / "llm_responses").mkdir(exist_ok=True)
+            (self.log_dir / "azr_training_data").mkdir(exist_ok=True)
+        # 기존 구조에서는 서브 디렉토리를 생성하지 않음 (메인 로그 파일만)
+        # 기본 로거 설정
+        self.logger = logging.getLogger("TestTimeRLVR")
+        self.logger.setLevel(getattr(logging, log_level))
+        # 핸들러 설정
+        if not self.logger.handlers:
+            # 파일 핸들러
+            if log_file:
+                # 특정 로그 파일 경로가 주어진 경우 (Ray worker에서 사용)
+                self.log_file_path = log_file
+                file_handler = logging.FileHandler(log_file, mode='a')  # append mode
+            else:
+                # 기본 로그 파일 생성
+                self.log_file_path = str(self.log_dir / f"testtime_rlvr_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
+                file_handler = logging.FileHandler(self.log_file_path)
+            file_handler.setLevel(logging.DEBUG)
+            # 콘솔 핸들러
+            console_handler = logging.StreamHandler()
+            console_handler.setLevel(getattr(logging, log_level))
+            # 포맷터
+            formatter = logging.Formatter(
+                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+            )
+            file_handler.setFormatter(formatter)
+            console_handler.setFormatter(formatter)
+            self.logger.addHandler(file_handler)
+            self.logger.addHandler(console_handler)
+    def _get_timestamp(self) -> str:
+        """현재 타임스탬프 반환"""
+        return datetime.now().isoformat()
+    def _save_json_log(self, subdirectory: str, filename: str, data: Dict[str, Any]):
+        """JSON 로그 파일 저장"""
+        if self.use_integrated_structure:
+            # 설계된 구조: 각 카테고리별로 적절한 디렉토리에 저장
+            if subdirectory == "ipo_extraction":
+                # IPO 추출 로그는 diverse_programs 하위에 별도로 저장
+                log_path = self.log_dir / "diverse_programs" / f"{filename}.json"
+            elif subdirectory == "task_generation":
+                # Task generation 로그는 round 레벨에 저장 (모든 task 종류 포함)
+                log_path = self.log_dir / f"{filename}.json"
+            elif subdirectory == "problems":
+                log_path = self.log_dir / "current_evaluation" / f"{filename}.json"
+            elif subdirectory == "performance":
+                log_path = self.log_dir / "current_evaluation" / f"{filename}.json"
+            elif subdirectory == "training":
+                log_path = self.log_dir / "azr_training_data" / f"{filename}.json"
+            else:
+                # 기본값
+                log_path = self.log_dir / subdirectory / f"{filename}.json"
+        else:
+            # 기존 구조
+            log_path = self.log_dir / subdirectory / f"{filename}.json"
+        # 디렉토리 생성 (없다면)
+        log_path.parent.mkdir(parents=True, exist_ok=True)
+        # 기존 로그 로드 (있다면)
+        if log_path.exists():
+            with open(log_path, 'r', encoding='utf-8') as f:
+                existing_logs = json.load(f)
+        else:
+            existing_logs = []
+        # 새 로그 추가
+        data['timestamp'] = self._get_timestamp()
+        existing_logs.append(data)
+        # 저장
+        with open(log_path, 'w', encoding='utf-8') as f:
+            json.dump(existing_logs, f, indent=2, ensure_ascii=False)
+    # ============================================================================
+    # 1. 벤치마크 문제 로깅 (요구사항 1)
+    # ============================================================================
+    def log_problem_attempt(self, problem: Dict[str, Any], solution: str,
+                           is_correct: bool, validation_result: Optional[Dict] = None):
+        """벤치마크 문제와 LLM 답변, 정답 여부 로그"""
+        log_data = {
+            'problem_id': problem.get('task_id', 'unknown'),
+            'benchmark': problem.get('benchmark_name', 'unknown'),
+            'problem_prompt': problem.get('prompt', ''),
+            'canonical_solution': problem.get('canonical_solution', ''),
+            'llm_solution': solution,
+            'is_correct': is_correct,
+            'validation_result': validation_result or {}
+        }
+        self._save_json_log("problems", f"problem_{problem.get('task_id', 'unknown').replace('/', '_')}", log_data)
+        status = "✅ CORRECT" if is_correct else "❌ INCORRECT"
+        self.logger.info(f"Problem {problem.get('task_id', 'unknown')}: {status}")
+    def log_problem_loaded(self, problem_id: str, benchmark_name: str, method: str = "Original"):
+        """문제 로딩 로그 (EvalPlus/Original 방식 구분)"""
+        self.logger.info(f"Loaded problem {problem_id} from {benchmark_name} ({method} method)")
+    # ============================================================================
+    # 2. IPO 추출 로깅 (요구사항 2)
+    # ============================================================================
+    def log_ipo_extraction(self, problem_id: str, extracted_triples: List[Dict],
+                          validation_results: List[bool]):
+        """생성된 (i,p,o) 트리플과 검증 결과 로그"""
+        log_data = {
+            'problem_id': problem_id,
+            'num_triples': len(extracted_triples),
+            'triples': extracted_triples,
+            'validation_results': validation_results,
+            'valid_triples': sum(validation_results),
+            'invalid_triples': len(validation_results) - sum(validation_results)
+        }
+        self._save_json_log("ipo_extraction", f"ipo_{problem_id.replace('/', '_')}", log_data)
+        self.logger.info(f"IPO Extraction for {problem_id}: {len(extracted_triples)} triples, "
+                        f"{sum(validation_results)} valid")
+    # ============================================================================
+    # 3. 태스크 생성 로깅 (요구사항 2)
+    # ============================================================================
+    def log_task_generation(self, problem_id: str, induction_tasks: List[Dict],
+                           deduction_tasks: List[Dict], abduction_tasks: List[Dict]):
+        """생성된 induction, deduction, abduction 문제 로그"""
+        log_data = {
+            'problem_id': problem_id,
+            'induction_tasks': {
+                'count': len(induction_tasks),
+                'tasks': induction_tasks
+            },
+            'deduction_tasks': {
+                'count': len(deduction_tasks),
+                'tasks': deduction_tasks
+            },
+            'abduction_tasks': {
+                'count': len(abduction_tasks),
+                'tasks': abduction_tasks
+            },
+            'total_tasks': len(induction_tasks) + len(deduction_tasks) + len(abduction_tasks)
+        }
+        self._save_json_log("task_generation", f"tasks_{problem_id.replace('/', '_')}", log_data)
+        total_tasks = log_data['total_tasks']
+        self.logger.info(f"Task Generation for {problem_id}: {total_tasks} tasks "
+                        f"(I:{len(induction_tasks)}, D:{len(deduction_tasks)}, A:{len(abduction_tasks)})")
+    # ============================================================================
+    # 4. 학습 메트릭 로깅 (요구사항 3, 4)
+    # ============================================================================
+    def log_task_accuracy(self, problem_id: str, task_type: str, accuracy: float,
+                         rewards: List[float], step: int):
+        """induction/deduction/abduction 태스크 정확도와 reward 로그"""
+        log_data = {
+            'problem_id': problem_id,
+            'task_type': task_type,  # 'induction', 'deduction', 'abduction'
+            'step': step,
+            'accuracy': accuracy,
+            'rewards': rewards,
+            'avg_reward': sum(rewards) / len(rewards) if rewards else 0.0,
+            'max_reward': max(rewards) if rewards else 0.0,
+            'min_reward': min(rewards) if rewards else 0.0
+        }
+        self._save_json_log("training", f"accuracy_{problem_id.replace('/', '_')}", log_data)
+        self.logger.info(f"Step {step} - {task_type.capitalize()} accuracy: {accuracy:.4f}, "
+                        f"avg reward: {log_data['avg_reward']:.4f}")
+    def log_verl_training(self, problem_id: str, step: int, loss: float,
+                         learning_rate: float, metrics: Dict[str, Any]):
+        """VeRL 학습 진행 상황 로그"""
+        log_data = {
+            'problem_id': problem_id,
+            'step': step,
+            'loss': loss,
+            'learning_rate': learning_rate,
+            'metrics': metrics
+        }
+        self._save_json_log("training", f"verl_{problem_id.replace('/', '_')}", log_data)
+        self.logger.info(f"VeRL Training Step {step}: loss={loss:.6f}, lr={learning_rate:.2e}")
+    # ============================================================================
+    # 5. 성능 변화 로깅
+    # ============================================================================
+    def log_performance_change(self, problem_id: str, cycle: int,
+                              before_accuracy: float, after_accuracy: float,
+                              improvement: float):
+        """매 사이클별 성능 변화 로그"""
+        log_data = {
+            'problem_id': problem_id,
+            'cycle': cycle,
+            'before_accuracy': before_accuracy,
+            'after_accuracy': after_accuracy,
+            'improvement': improvement,
+            'improvement_percentage': improvement * 100
+        }
+        self._save_json_log("performance", f"cycle_{problem_id.replace('/', '_')}", log_data)
+        direction = "↗️" if improvement > 0 else "↘️" if improvement < 0 else "→"
+        self.logger.info(f"Cycle {cycle} Performance: {before_accuracy:.4f} → {after_accuracy:.4f} "
+                        f"({direction} {improvement:+.4f})")
+    # ============================================================================
+    # 일반 로깅
+    # ============================================================================
+    def log_info(self, message: str):
+        """일반 정보 로그"""
+        self.logger.info(message)
+    def log_error(self, message: str):
+        """에러 로그"""
+        self.logger.error(message)
+    def log_warning(self, message: str):
+        """경고 로그"""
+        self.logger.warning(message)
+    def log_debug(self, message: str):
+        """디버그 로그"""
+        self.logger.debug(message)
+    def get_log_summary(self) -> Dict[str, Any]:
+        """로그 요약 정보 반환"""
+        summary = {
+            'log_directory': str(self.log_dir),
+            'subdirectories': {
+                'problems': len(list((self.log_dir / "problems").glob("*.json"))),
+                'ipo_extraction': len(list((self.log_dir / "ipo_extraction").glob("*.json"))),
+                'task_generation': len(list((self.log_dir / "task_generation").glob("*.json"))),
+                'training': len(list((self.log_dir / "training").glob("*.json"))),
+                'performance': len(list((self.log_dir / "performance").glob("*.json")))
+            }
+        }
+        return summary

absolute_zero_reasoner/testtime/prompts.py ADDED Viewed

	@@ -0,0 +1,413 @@

+"""
+TestTime RLVR 프롬프트 중앙 관리 시스템
+모든 프롬프트를 한 곳에서 관리하여 일관성과 유지보수성을 향상시킵니다.
+"""
+from typing import Dict, List, Any
+from dataclasses import dataclass
+from enum import Enum
+class PromptType(Enum):
+    """Enumeration of prompt types."""
+    SOLUTION_GENERATION = "solution_generation"
+    DIVERSE_GENERATION = "diverse_generation"
+    INPUT_GENERATION = "input_generation"
+    TASK_GENERATION = "task_generation"
+    TASK_EVALUATION = "task_evaluation"
+class BenchmarkType(Enum):
+    """Enumeration of benchmark types."""
+    HUMANEVAL = "humaneval"
+    MBPP = "mbpp"
+    GENERAL = "general"
+@dataclass
+class PromptTemplate:
+    """Data class describing one prompt template."""
+    # Human-readable name of the template.
+    name: str
+    # Template text; the templates in this file use ``{placeholder}`` fields.
+    template: str
+    # Short description of what the template is for.
+    description: str
+    # Benchmark the template belongs to.
+    benchmark: BenchmarkType
+    # Sampling temperature associated with the template (defaults to 0.05).
+    temperature: float = 0.05
+    # Names of the placeholders the template uses; None is replaced by an
+    # empty list in __post_init__.
+    variables: List[str] = None
+    def __post_init__(self):
+        # Give every instance its own list instead of sharing a mutable default.
+        if self.variables is None:
+            self.variables = []
+class PromptManager:
+    """프롬프트 중앙 관리 클래스"""
+    def __init__(self):
+        # Build the full template table once; lookups go through self.prompts.
+        self.prompts = self._initialize_prompts()
+    def _initialize_prompts(self) -> Dict[str, PromptTemplate]:
+        """Build every prompt template and return them keyed by prompt name.
+
+        Template text is passed through ``str.format`` by ``get_prompt``, so
+        literal braces inside a template must be doubled (``{{`` / ``}}``).
+        """
+        prompts = {}
+        # ================================================================================
+        # 1. SOLUTION GENERATION PROMPTS (Current Evaluation - baseline)
+        # ================================================================================
+        # HumanEval: basic solution generation
+        prompts["solution_humaneval_basic"] = PromptTemplate(
+            name="HumanEval 기본 솔루션 생성",
+            benchmark=BenchmarkType.HUMANEVAL,
+            temperature=0.05,
+            description="HumanEval 문제에 대한 기본 솔루션 생성 (greedy)",
+            variables=["problem_prompt"],
+            template="""You are a Python writing assistant. Complete the following Python function.
+{problem_prompt}
+Please provide a complete implementation of the function."""
+        )
+        # HumanEval: prompts that contain multiple functions
+        prompts["solution_humaneval_multi"] = PromptTemplate(
+            name="HumanEval 다중 함수 솔루션 생성",
+            benchmark=BenchmarkType.HUMANEVAL,
+            temperature=0.05,
+            description="여러 함수가 있는 HumanEval 문제 처리",
+            variables=["problem_prompt", "entry_point"],
+            template="""You are a Python writing assistant. Complete the following Python function.
+{problem_prompt}
+Please provide ONLY the implementation for the function `{entry_point}`.
+Complete the body of the `{entry_point}` function where it is incomplete.
+Do not modify or reimplement other functions that are already complete."""
+        )
+        # MBPP: basic solution generation
+        prompts["solution_mbpp_basic"] = PromptTemplate(
+            name="MBPP 기본 솔루션 생성",
+            benchmark=BenchmarkType.MBPP,
+            temperature=0.05,
+            description="MBPP 문제에 대한 기본 솔루션 생성",
+            variables=["problem_prompt"],
+            template="""
+Please generate a complete, self-contained Python script that solves the following problem.
+CRITICAL REQUIREMENTS:
+- You MUST maintain the EXACT function signature as shown in the examples
+- The function name, parameter names, parameter types, and parameter count MUST match exactly with the examples
+- Look at the assert statements carefully to understand the expected function signature
+- DO NOT change the number of parameters or their types from what is shown in the examples
+Instructions:
+- Wrap the entire script in a Markdown code block with syntax highlighting (```python ... ```).
+- For each function, include a concise docstring enclosed in triple single quotes (''' ... '''), placed immediately below the def line.
+The docstring should briefly describe:
+• The function's purpose
+• Input parameters
+• Return value
+Problem statement:
+{problem_prompt}
+"""
+        )
+        # ================================================================================
+        # 2. DIVERSE GENERATION PROMPTS (diverse program generation)
+        # ================================================================================
+        # HumanEval: diverse solution
+        prompts["diverse_humaneval_basic"] = PromptTemplate(
+            name="HumanEval 다양성 솔루션 생성",
+            benchmark=BenchmarkType.HUMANEVAL,
+            temperature=0.7,
+            description="HumanEval 문제에 대한 다양한 접근법 솔루션",
+            variables=["diversity_instruction", "problem_prompt"],
+            template="""You are a Python writing assistant. {diversity_instruction}
+{problem_prompt}
+Please provide a complete implementation of the function."""
+        )
+        # HumanEval: diverse solution for prompts with multiple functions
+        prompts["diverse_humaneval_multi"] = PromptTemplate(
+            name="HumanEval 다양성 다중 함수 솔루션",
+            benchmark=BenchmarkType.HUMANEVAL,
+            temperature=0.7,
+            description="다중 함수 HumanEval에 대한 다양성 솔루션",
+            variables=["diversity_instruction", "problem_prompt", "entry_point"],
+            template="""You are a Python writing assistant. {diversity_instruction}
+{problem_prompt}
+Please provide ONLY the implementation for the function `{entry_point}`.
+Complete the body of the `{entry_point}` function where it is incomplete.
+Do not modify or reimplement other functions that are already complete."""
+        )
+        # MBPP: diverse solution
+        prompts["diverse_mbpp_basic"] = PromptTemplate(
+            name="MBPP 다양성 솔루션 생성",
+            benchmark=BenchmarkType.MBPP,
+            temperature=0.7,
+            description="MBPP 문제에 대한 다양한 접근법 솔루션",
+            variables=["diversity_instruction", "problem_prompt"],
+            template="""Please generate a complete, self-contained Python script that solves the following problem.
+CRITICAL REQUIREMENTS:
+- You MUST maintain the EXACT function signature as shown in the examples
+- The function name, parameter names, parameter types, and parameter count MUST match exactly with the examples
+- Look at the assert statements carefully to understand the expected function signature
+- DO NOT change the number of parameters or their types from what is shown in the examples
+Instructions:
+- Wrap the entire script in a Markdown code block with syntax highlighting (```python ... ```).
+- For each function, include a concise docstring enclosed in triple single quotes (''' ... '''), placed immediately below the def line.
+The docstring should briefly describe:
+• The function's purpose
+• Input parameters
+• Return value
+{diversity_instruction}
+Problem statement:
+{problem_prompt}
+"""
+        )
+        # ================================================================================
+        # 3. INPUT GENERATION PROMPTS (input augmentation)
+        # ================================================================================
+        prompts["input_generation_basic"] = PromptTemplate(
+            name="기본 입력 생성",
+            benchmark=BenchmarkType.GENERAL,
+            temperature=0.5,
+            description="기존 IPO 예제를 바탕으로 새로운 입력 생성",
+            variables=["problem_description", "existing_examples", "full_code", "arg_type_info"],
+            template="""Given the following problem description and its Python function implementation, first analyze the types and valid ranges of the function arguments, then write **5 different example inputs** for the function that cover a diverse mix of typical (general) cases and edge/boundary cases.
+Problem Description:
+'''
+{problem_description}
+'''
+Existing Examples from Problem:
+{existing_examples}
+Function Implementation:
+```python
+{full_code}
+```
+{arg_type_info}
+Based on the existing examples above, generate 5 NEW diverse test inputs that are different from the existing ones. Each input should be a Python dict where:
+- Keys are the exact parameter names from the function signature
+- Values are appropriate test values for each parameter
+Format your response as:
+```python
+examples = [
+    {{dict_with_all_function_parameters}},  # Description of this test case
+    {{dict_with_all_function_parameters}},  # Description of this test case
+    ...  # Continue for all 5 examples
+]
+```
+Ensure your examples include:
+- At least 2 typical/general cases
+- At least 2 edge/boundary cases
+- 1 special case (empty, zero, maximum values, etc.)
+- All examples should be DIFFERENT from the existing examples shown above"""
+        )
+        # ================================================================================
+        # 4. TASK GENERATION PROMPTS (IPO -> reasoning tasks)
+        # ================================================================================
+        prompts["task_induction"] = PromptTemplate(
+            name="Induction 태스크 생성 (AZR code_f)",
+            benchmark=BenchmarkType.GENERAL,
+            temperature=0.05,
+            description="주어진 입력-출력으로부터 프로그램 추론 (AZR 원본)",
+            variables=["input_output_pairs", "message"],
+            template="""A conversation between User and Assistant.
+The User provides a set of input/output pairs and a message describing the hidden function. The Assistant must:
+1. **Privately think step-by-step** about how to reconstruct the general function based on the provided examples.
+2. **Output exactly one** `<think>...</think>` block containing the full reasoning process.
+3. **Then output exactly one** `<answer>...</answer>` block containing **only** the Python code snippet defining the function `f`—no labels, no comments, no extra text.
+4. **Do not** generate any text outside these two blocks.
+5. Follow to the **code requirements** and **formatting rules**.
+# Code Requirements:
+- Name the entry function `f` (e.g., `def f(...): ...`), you may include nested definitions inside `f`.
+- Ensure the function returns a value.
+- Include at least one input parameter.
+- Make the function deterministic.
+- AVOID the FOLLOWING:
+  * Random functions or variables
+  * Date/time operations
+  * I/O operations (reading files, network requests)
+  * Printing or logging
+  * Any external state
+- Ensure execution completes within 10 seconds on a modern CPU.
+- All imports and custom class definitions must be at the very top of the code snippet.
+- The snippet must end with a return statement from the main function `f`; anything after will be removed.
+User:
+# Input and Output Pairs:
+{input_output_pairs}
+# Message:
+{message}"""
+        )
+        prompts["task_deduction"] = PromptTemplate(
+            name="Deduction 태스크 생성 (AZR code_o)",
+            benchmark=BenchmarkType.GENERAL,
+            temperature=0.05,
+            description="주어진 프로그램과 입력으로부터 출력 추론 (AZR 원본)",
+            variables=["snippet", "input_args"],
+            template="""A conversation between User and Assistant.
+The User provides a Python code snippet and specific input values. The Assistant must:
+1. **Privately think step-by-step** about how the code executes with the given inputs.
+2. **Output exactly one** `<think>...</think>` block containing your full reasoning.
+3. **Then output exactly one** `<answer>...</answer>` block containing **only** the output values—no labels, no comments, no extra text.
+4. **Do not** generate any text outside these two blocks.
+5. Adhere to the **output rules**.
+# Output Rules:
+- If the output is a string, wrap it in quotes.
+- For dicts, lists, and other literals, use valid Python literal notation.
+User:
+# Python Code Snippet:
+{snippet}
+# Input:
+{input_args}"""
+        )
+        prompts["task_abduction"] = PromptTemplate(
+            name="Abduction 태스크 생성 (AZR code_i)",
+            benchmark=BenchmarkType.GENERAL,
+            temperature=0.05,
+            description="주어진 프로그램과 출력으로부터 입력 추론 (AZR 원본)",
+            variables=["snippet", "output"],
+            template="""A conversation between User and Assistant.
+The User provides a Python code snippet and its observed output. The Assistant must:
+1. **Privately think step-by-step** about which input produces that output.
+2. **Output exactly one** `<think>...</think>` block containing your full reasoning.
+3. **Then output exactly one** `<answer>...</answer>` block containing **only** the input values—no labels, no comments, no extra text.
+4. **Do not** generate any text outside these two blocks.
+5. Adhere to the **input rules**.
+# Input Rules:
+- If an argument is a string, wrap it in quotes.
+- For multiple arguments, separate by commas.
+- Use Python literal notation for lists, dicts, tuples.
+- Boolean values must be `True` or `False`.
+User:
+# Python Code Snippet:
+{snippet}
+# Observed Output:
+{output}"""
+        )
+        # ================================================================================
+        # 5. TASK EVALUATION PROMPTS (LLM responses to tasks)
+        # ================================================================================
+        # Pass-through template: the already-built task prompt is used as is.
+        prompts["task_evaluation_basic"] = PromptTemplate(
+            name="기본 태스크 평가",
+            benchmark=BenchmarkType.GENERAL,
+            temperature=0.05,
+            description="생성된 추론 태스크에 대한 LLM 응답",
+            variables=["task_prompt"],
+            template="{task_prompt}"
+        )
+        return prompts
+    def get_prompt(self, prompt_key: str, **kwargs) -> str:
+        """프롬프트 키로 템플릿을 가져와 변수를 채움"""
+        if prompt_key not in self.prompts:
+            raise ValueError(f"Unknown prompt key: {prompt_key}")
+        template = self.prompts[prompt_key]
+        # 필수 변수 확인
+        missing_vars = []
+        for var in template.variables:
+            if var not in kwargs:
+                missing_vars.append(var)
+        if missing_vars:
+            raise ValueError(f"Missing required variables for prompt '{prompt_key}': {missing_vars}")
+        # 템플릿 포맷팅
+        try:
+            return template.template.format(**kwargs)
+        except KeyError as e:
+            raise ValueError(f"Template formatting error for prompt '{prompt_key}': {e}")
+    def get_temperature(self, prompt_key: str) -> float:
+        """프롬프트의 권장 temperature 반환"""
+        if prompt_key not in self.prompts:
+            raise ValueError(f"Unknown prompt key: {prompt_key}")
+        return self.prompts[prompt_key].temperature
+    def get_diversity_instruction(self, variation_id: int) -> str:
+        """variation_id에 따른 다양성 지시문 반환"""
+        diversity_instructions = [
+            "",  # 기본
+            "",
+            "",
+            ""
+        ]
+        # diversity_instructions = [
+        #     "",  # 기본
+        #     "Implement this in a robust way that works well for various examples",
+        #     "Provide an alternative solution with a unique implementation style:",
+        #     "Try to implement using a different approach, algorithm, or coding style than typical solutions."
+        # ]
+        return diversity_instructions[variation_id % len(diversity_instructions)]
+    def list_prompts(self) -> Dict[str, PromptTemplate]:
+        """모든 프롬프트 템플릿 목록 반환"""
+        return self.prompts.copy()
+    def get_prompts_by_type(self, benchmark: BenchmarkType) -> Dict[str, PromptTemplate]:
+        """벤치마크 타입별 프롬프트 반환"""
+        return {
+            key: template for key, template in self.prompts.items()
+            if template.benchmark == benchmark or template.benchmark == BenchmarkType.GENERAL
+        }
+# Global prompt manager instance (built once at import time)
+prompt_manager = PromptManager()
+# Convenience functions that delegate to the global instance
+def get_prompt(prompt_key: str, **kwargs) -> str:
+    """Convenience wrapper around ``prompt_manager.get_prompt``."""
+    return prompt_manager.get_prompt(prompt_key, **kwargs)
+def get_temperature(prompt_key: str) -> float:
+    """Convenience wrapper around ``prompt_manager.get_temperature``."""
+    return prompt_manager.get_temperature(prompt_key)
+def get_diversity_instruction(variation_id: int) -> str:
+    """Convenience wrapper around ``prompt_manager.get_diversity_instruction``."""
+    return prompt_manager.get_diversity_instruction(variation_id)

absolute_zero_reasoner/testtime/solution_generator.py ADDED Viewed

	@@ -0,0 +1,877 @@

+"""
+Initial Solution Generator
+AZR 기반 TestTime RLVR을 위한 초기 솔루션 생성기
+기존 Test-Time-RLVR의 generate_initial_solution 함수를 클래스화하여 확장
+"""
+import re
+import torch
+from typing import Dict, Any, Optional, Tuple, List
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from .config import TestTimeConfig
+from .logger import TestTimeLogger
+from .prompts import get_prompt, get_temperature, get_diversity_instruction
+# AZR에서 사용하는 코드 추출 함수 직접 임포트
+from ..rewards.custom_evaluate import extract_code
+# VLLM 최적화 지원
+try:
+    from vllm import LLM, SamplingParams
+    VLLM_AVAILABLE = True
+except ImportError:
+    VLLM_AVAILABLE = False
+class InitialSolutionGenerator:
+    """벤치마크 문제에 대한 초기 솔루션 생성"""
+    def __init__(self, model, tokenizer, config: TestTimeConfig,
+                 logger: Optional[TestTimeLogger] = None, use_vllm: bool = True):
+        self.model = model
+        self.tokenizer = tokenizer
+        self.config = config
+        self.logger = logger or TestTimeLogger()
+        self.use_vllm = use_vllm and VLLM_AVAILABLE
+        # VLLM 사용 가능 여부 확인 및 로깅
+        if use_vllm and not VLLM_AVAILABLE:
+            self.logger.log_info("⚠️  VLLM requested but not available, falling back to HuggingFace")
+        elif self.use_vllm:
+            self.logger.log_info("🚀 Using VLLM for optimized inference")
+        else:
+            self.logger.log_info("🔧 Using HuggingFace Transformers for inference")
+    def generate(self, problem: Dict[str, Any]) -> str:
+        """문제에 대한 초기 솔루션 생성 (AZR 코드 평가 프롬프트 사용)"""
+        problem_prompt = problem['prompt']
+        problem_id = problem.get('task_id', 'unknown')
+        # AZR 코드 평가에서 사용하는 프롬프트 포맷 적용
+        # prompt = f"Please provide a self-contained Python script that solves the following problem in a markdown code block:\n\n{problem_prompt}"
+        # 중앙 프롬프트 시스템 사용
+        if 'HumanEval' in problem_id:
+            # entry_point 함수명 찾기
+            entry_point = problem.get('entry_point', 'unknown')
+            # 프롬프트에서 함수가 여러 개 있는지 확인
+            import re
+            function_count = len(re.findall(r'^\s*def\s+\w+', problem_prompt, re.MULTILINE))
+            if function_count > 1:
+                # 다중 함수 프롬프트 사용
+                prompt = get_prompt("solution_humaneval_multi",
+                                  problem_prompt=problem_prompt,
+                                  entry_point=entry_point)
+            else:
+                # 단일 함수 프롬프트 사용
+                prompt = get_prompt("solution_humaneval_basic",
+                                  problem_prompt=problem_prompt)
+        else:
+            # MBPP 프롬프트 사용
+            prompt = get_prompt("solution_mbpp_basic",
+                              problem_prompt=problem_prompt)
+        self.logger.log_info(f"🔍 Generating initial solution for {problem_id}")
+        self.logger.log_info(f"📋 Full prompt: {prompt}")
+        # VLLM 또는 HuggingFace 백엔드 선택
+        if self.use_vllm and isinstance(self.model, LLM):
+            solution = self._generate_with_vllm(prompt)
+        else:
+            solution = self._generate_with_huggingface(prompt)
+        # 마크다운 코드 블록에서 Python 코드 추출 (개선된 방식)
+        extracted_solution = self._extract_python_code(solution)
+        # 코드 추출 결과 로깅
+        if extracted_solution and extracted_solution != solution:
+            self.logger.log_info(f"🔍 Extracted Python code from markdown block")
+            solution = extracted_solution
+        elif not extracted_solution:
+            self.logger.log_info(f"🔍 No markdown code block found, using original text")
+        # HumanEval의 경우 프롬프트에서 import 추출하여 추가 (EvalPlus 방식)
+        if 'HumanEval' in problem_id:
+            solution = self._add_imports_from_prompt(solution, problem_prompt)
+        # 함수 정의 복구 (AZR 로직 그대로)
+        solution = self._fix_function_definition(solution, prompt, problem_id)
+        self.logger.log_info(f"✅ Generated solution ({len(solution)} chars)")
+        self.logger.log_info(f"🔍 Solution preview: {solution[:200]}...")
+        # 디버깅: 실제 솔루션 내용 로깅
+        self.logger.log_info(f"🔍 Full solution for debugging:")
+        self.logger.log_info(f"--- START SOLUTION ---")
+        self.logger.log_info(solution)
+        self.logger.log_info(f"--- END SOLUTION ---")
+        return solution
+    def generate_diverse(self, problem: Dict[str, Any], temperature: float = 0.7, variation_id: int = 0) -> str:
+        """다양한 솔루션 생성 (높은 temperature 사용)"""
+        problem_prompt = problem['prompt']
+        problem_id = problem.get('task_id', 'unknown')
+        # 중앙 관리 다양성 프롬프트 시스템 사용
+        diversity_instruction = get_diversity_instruction(variation_id)
+        # HumanEval에 대해서는 함수 완성 요청 (다양성 버전)
+        if 'HumanEval' in problem_id:
+            entry_point = problem.get('entry_point', 'unknown')
+            import re
+            function_count = len(re.findall(r'^\s*def\s+\w+', problem_prompt, re.MULTILINE))
+            if function_count > 1:
+                prompt = get_prompt("diverse_humaneval_multi",
+                                  diversity_instruction=diversity_instruction,
+                                  problem_prompt=problem_prompt,
+                                  entry_point=entry_point)
+            else:
+                prompt = get_prompt("diverse_humaneval_basic",
+                                  diversity_instruction=diversity_instruction,
+                                  problem_prompt=problem_prompt)
+        else:
+            # MBPP 다양성 프롬프트 사용
+            prompt = get_prompt("diverse_mbpp_basic",
+                              diversity_instruction=diversity_instruction,
+                              problem_prompt=problem_prompt)
+        self.logger.log_info(f"🎨 Generating diverse solution #{variation_id+1} for {problem_id}")
+        # 다양성 생성 메서드 사용
+        try:
+            from vllm import LLM
+            if isinstance(self.model, LLM):
+                solution = self._generate_with_vllm_diverse(prompt, temperature)
+            else:
+                solution = self._generate_with_huggingface_diverse(prompt, temperature)
+        except ImportError:
+            solution = self._generate_with_huggingface_diverse(prompt, temperature)
+        # 코드 추출 및 후처리 (기존과 동일)
+        extracted_solution = self._extract_python_code(solution)
+        if extracted_solution and extracted_solution != solution:
+            self.logger.log_info(f"🔍 Extracted Python code from markdown block")
+            solution = extracted_solution
+        if 'HumanEval' in problem_id:
+            solution = self._add_imports_from_prompt(solution, problem_prompt)
+        solution = self._fix_function_definition(solution, prompt, problem_id)
+        self.logger.log_info(f"✅ Generated diverse solution #{variation_id+1} ({len(solution)} chars)")
+        return solution
+    def _generate_with_vllm(self, prompt: str) -> str:
+        """VLLM 백엔드로 생성 (AZR 방식)"""
+        # AZR evaluation과 동일한 SamplingParams 설정
+        sampling_params = SamplingParams(
+            temperature=0.05,
+            max_tokens=2048,       # AZR 평가 설정
+            top_p=1.0,             # greedy mode
+            stop=["\n```\n"],      # 코드 블록 종료 시 정지
+        )
+        # VLLM 생성
+        outputs = self.model.generate([prompt], sampling_params, use_tqdm=False)
+        solution = outputs[0].outputs[0].text.replace("\t", "    ")  # AZR 방식 탭 처리
+        return solution.strip()
+    def _generate_with_vllm_diverse(self, prompt: str, temperature: float = 0.7) -> str:
+        """다양한 솔루션 생성용 VLLM 백엔드 (높은 temperature)"""
+        # 다양성을 위한 SamplingParams 설정
+        sampling_params = SamplingParams(
+            temperature=temperature,   # 높은 temperature로 다양성 확보
+            max_tokens=2048,
+            top_p=0.95,               # 다양성을 위해 top_p 사용
+            stop=["\n```\n"],         # 코드 블록 종료 시 정지
+        )
+        # VLLM 생성
+        outputs = self.model.generate([prompt], sampling_params, use_tqdm=False)
+        solution = outputs[0].outputs[0].text.replace("\t", "    ")
+        return solution.strip()
+    def generate_batch(self, prompts: List[str], temperature: float = 0.7) -> List[str]:
+        """배치로 여러 프롬프트 동시 처리"""
+        # 실제 모델 타입 확인 (VLLM 로딩 실패 시 HuggingFace 모델이 로드됨)
+        if self.use_vllm and isinstance(self.model, LLM):
+            raw_solutions = self._generate_batch_with_vllm(prompts, temperature)
+        else:
+            # HuggingFace는 순차 처리 (fallback)
+            raw_solutions = [self._generate_with_huggingface(prompt) for prompt in prompts]
+        # 각 솔루션에 대해 후처리 수행
+        processed_solutions = []
+        for i, (prompt, solution) in enumerate(zip(prompts, raw_solutions)):
+            # 1. 마크다운에서 Python 코드 추출
+            extracted = self._extract_python_code(solution)
+            if extracted and extracted != solution:
+                self.logger.log_info(f"🔍 Extracted Python code from markdown block for batch item {i+1}")
+                solution = extracted
+            # 2. HumanEval 문제인 경우 import 추가
+            # 프롬프트에서 problem ID 추출 (프롬프트에 포함되어 있다고 가정)
+            if 'HumanEval' in prompt:
+                # 프롬프트에서 원본 problem prompt 추출 시도
+                # 프롬프트 구조에 따라 조정 필요
+                solution = self._add_imports_from_prompt(solution, prompt)
+            # 3. 함수 정의 수정 (필요한 경우)
+            # generate_diverse와 동일한 처리
+            solution = self._fix_function_definition(solution, prompt)
+            processed_solutions.append(solution)
+        return processed_solutions
+    def _generate_batch_with_vllm(self, prompts: List[str], temperature: float = 0.7) -> List[str]:
+        """VLLM으로 배치 처리"""
+        # VLLM 샘플링 파라미터
+        # seed를 제거하여 매번 다른 응답 생성
+        sampling_params = SamplingParams(
+            temperature=temperature,
+            top_p=0.85,
+            max_tokens=1024,
+            stop=[]  # stop 토큰 명시적으로 비움
+        )
+        # VLLM 배치 생성
+        outputs = self.model.generate(prompts, sampling_params, use_tqdm=False)
+        # 결과 추출
+        solutions = []
+        for i, output in enumerate(outputs):
+            solution = output.outputs[0].text.replace("\t", "    ")
+            # 디버깅: finish_reason 확인
+            finish_reason = output.outputs[0].finish_reason
+            if finish_reason != "stop" and i < 3:  # 처음 3개만 로깅
+                self.logger.log_warning(f"Output {i} finish_reason: {finish_reason}, length: {len(solution)}")
+            solutions.append(solution.strip())
+        return solutions
+    def _generate_with_huggingface(self, prompt: str) -> str:
+        """HuggingFace 백엔드로 생성 (attention mask 수정)"""
+        # 토크나이저 처리 (attention mask 경고 수정)
+        inputs = self.tokenizer(prompt, return_tensors='pt', truncation=True, max_length=4096)
+        # attention mask 명시적으로 설정
+        if 'attention_mask' not in inputs:
+            inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])
+        # 디바이스 이동 (AZR 방식 그대로)
+        device = getattr(self.model, 'device', 'cuda' if torch.cuda.is_available() else 'cpu')
+        if isinstance(device, str):
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+        else:
+            # 모델이 이미 특정 디바이스에 있는 경우
+            inputs = {k: v.to(next(self.model.parameters()).device) for k, v in inputs.items()}
+        with torch.no_grad():
+            # 메모리 정리 (AZR 방식 그대로)
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            # AZR evaluation과 동일한 greedy 설정
+            outputs = self.model.generate(
+                inputs['input_ids'],
+                attention_mask=inputs['attention_mask'],  # attention mask 명시적으로 전달
+                max_new_tokens=2048,  # 원래 AZR 평가 설정
+                do_sample=False,      # greedy mode (--greedy와 동일)
+                pad_token_id=self.tokenizer.eos_token_id
+            )
+        # 솔루션 추출 (AZR 방식 그대로)
+        solution = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        solution = solution[len(prompt):].strip()
+        return solution
+    def _generate_with_huggingface_diverse(self, prompt: str, temperature: float = 0.7) -> str:
+        """다양한 솔루션 생성용 HuggingFace 백엔드 (높은 temperature)"""
+        # 토크나이저 처리
+        inputs = self.tokenizer(prompt, return_tensors='pt', truncation=True, max_length=4096)
+        # attention mask 명시적으로 설정
+        if 'attention_mask' not in inputs:
+            inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])
+        # 디바이스 이동
+        device = getattr(self.model, 'device', 'cuda' if torch.cuda.is_available() else 'cpu')
+        if isinstance(device, str):
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+        else:
+            # 모델이 이미 특정 디바이스에 있는 경우
+            inputs = {k: v.to(next(self.model.parameters()).device) for k, v in inputs.items()}
+        with torch.no_grad():
+            # 메모리 정리
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            # 다양성을 위한 sampling 설정
+            outputs = self.model.generate(
+                inputs['input_ids'],
+                attention_mask=inputs['attention_mask'],
+                max_new_tokens=2048,
+                do_sample=True,          # sampling 활성화
+                temperature=temperature,  # 높은 temperature
+                top_p=0.95,             # 다양성을 위해 top_p 사용
+                pad_token_id=self.tokenizer.eos_token_id,
+                eos_token_id=self.tokenizer.eos_token_id
+            )
+        # 솔루션 추출
+        solution = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        solution = solution[len(prompt):].strip()
+        return solution
+    def _extract_python_code(self, solution: str) -> str:
+        """개선된 Python 코드 추출 (AZR 방식 + 추가 패턴)"""
+        # 1. AZR의 extract_code 함수 먼저 시도
+        try:
+            extracted = extract_code(solution, language="python")
+            if extracted:
+                return extracted
+        except:
+            pass
+        # 2. 다양한 마크다운 패턴 시도
+        patterns = [
+            r'```python\n(.*?)```',           # ```python ... ```
+            r'```\n(.*?)```',                 # ``` ... ```
+            r'```py\n(.*?)```',              # ```py ... ```
+            r'```Python\n(.*?)```',          # ```Python ... ```
+            r'Here is.*?:\n\n```python\n(.*?)```',  # 설명 텍스트 포함
+            r'Here is.*?:\n\n```\n(.*?)```',        # 설명 텍스트 포함
+        ]
+        for pattern in patterns:
+            matches = re.findall(pattern, solution, re.DOTALL | re.IGNORECASE)
+            if matches:
+                return matches[-1].strip()
+        # 3. def로 시작하는 함수 찾기
+        lines = solution.split('\n')
+        code_lines = []
+        in_function = False
+        for line in lines:
+            if line.strip().startswith('def '):
+                in_function = True
+                code_lines.append(line)
+            elif in_function and (line.startswith('    ') or line.strip() == ''):
+                code_lines.append(line)
+            elif in_function and line.strip() and not line.startswith('    '):
+                # 함수 정의 끝
+                break
+        if code_lines:
+            return '\n'.join(code_lines)
+        # 4. 원본 반환
+        return solution
+    def _add_imports_from_prompt(self, solution: str, prompt: str) -> str:
+        """HumanEval 프롬프트에서 import 문을 추출하여 솔루션에 추가 (EvalPlus 방식)"""
+        # 이미 import가 있으면 그대로 반환
+        if 'from typing import' in solution or 'import typing' in solution:
+            return solution
+        # 프롬프트에서 import 문 추출
+        import_lines = []
+        prompt_lines = prompt.split('\n')
+        for line in prompt_lines:
+            stripped = line.strip()
+            # import 문 찾기
+            if (stripped.startswith('from ') and 'import' in stripped) or stripped.startswith('import '):
+                import_lines.append(line)
+            # 함수 정의가 시작되면 중단
+            elif stripped.startswith('def '):
+                break
+        # import가 없으면 원본 반환
+        if not import_lines:
+            return solution
+        # import 추가
+        self.logger.log_info(f"🔧 Adding imports from prompt: {import_lines}")
+        # 솔루션이 이미 import로 시작하는지 확인
+        solution_lines = solution.split('\n')
+        first_non_empty_line = None
+        for i, line in enumerate(solution_lines):
+            if line.strip():
+                first_non_empty_line = i
+                break
+        # import를 맨 앞에 추가
+        if first_non_empty_line is not None:
+            # 기존 import 뒤에 추가하거나 맨 앞에 추가
+            imports_text = '\n'.join(import_lines) + '\n\n'
+            # 첫 번째 비어있지 않은 줄이 import인 경우
+            if solution_lines[first_non_empty_line].strip().startswith(('import ', 'from ')):
+                # 마지막 import 찾기
+                last_import_idx = first_non_empty_line
+                for i in range(first_non_empty_line, len(solution_lines)):
+                    if solution_lines[i].strip() and not solution_lines[i].strip().startswith(('import ', 'from ')):
+                        break
+                    if solution_lines[i].strip().startswith(('import ', 'from ')):
+                        last_import_idx = i
+                # 마지막 import 다음에 추가
+                solution_lines.insert(last_import_idx + 1, '')
+                solution_lines.insert(last_import_idx + 1, '\n'.join(import_lines))
+                return '\n'.join(solution_lines)
+            else:
+                # 맨 앞에 추가
+                return imports_text + solution
+        return imports_text + solution
+    def _fix_function_definition(self, solution: str, prompt: str, problem_id: str = "") -> str:
+        """함수 정의가 누락된 경우 복구 + lpw 스타일 중복 처리"""
+        # lpw 스타일: 프롬프트에서 함수 이름 추출
+        func_def_match = re.search(r'def\s+(\w+)\([^)]*\)(?:\s*->\s*[^:]+)?:', prompt)
+        if not func_def_match:
+            return solution
+        entry_point = func_def_match.group(1)
+        func_def_line = func_def_match.group(0)
+        # HumanEval의 경우 전체 코드를 반환하므로 중복 처리 불필요
+        if 'HumanEval' in problem_id:
+            # 이미 전체 코드가 있으므로 그대로 반환
+            return solution
+        # MBPP의 경우 기존 로직 유지
+        # Case 1: LLM이 전체 함수를 생성한 경우 (lpw 스타일 체크)
+        if (prompt in solution) or (f'def {entry_point}(' in solution):
+            # 함수가 이미 포함되어 있음
+            self.logger.log_info(f"✅ Function definition already present for {entry_point}")
+            return solution
+        # Case 2: 함수 본문만 생성한 경우 - 함수 정의 추가
+        if solution and not solution.startswith('def '):
+            # 함수 정의와 함수 내용을 결합
+            lines = solution.split('\n')
+            fixed_lines = [func_def_line]
+            for line in lines:
+                if line.strip():  # 빈 줄이 아닌 경우
+                    # if __name__ == "__main__": 부분은 함수 밖에 있어야 함
+                    if line.strip().startswith('if __name__'):
+                        # 함수 정의 끝내고 메인 부분 시작
+                        fixed_lines.append('')  # 빈 줄 추가
+                        fixed_lines.append(line.strip())
+                    else:
+                        # 함수 내용은 4칸 인덴테이션
+                        if not line.startswith('    ') and line.strip():
+                            line = '    ' + line.lstrip()
+                        fixed_lines.append(line)
+                else:
+                    fixed_lines.append(line)
+            solution = '\n'.join(fixed_lines)
+            self.logger.log_info(f"🔧 Fixed function definition for {entry_point}")
+        return solution
+    def generate_fallback_solution(self, problem: Dict[str, Any]) -> str:
+        """문제 생성 실패 시 대체 솔루션 생성"""
+        entry_point = problem.get('entry_point', 'solution')
+        problem_description = problem.get('prompt', '')
+        # 문제 유형별 기본 템플릿 (기존 방식)
+        if 'similar_elements' in problem_description:
+            # similar_elements 문제 (Mbpp/2)
+            solution = f"""def {entry_point}(test_tup1, test_tup2):
+    return tuple(set(test_tup1) & set(test_tup2))"""
+        elif 'kth_element' in problem_description:
+            # kth_element 문제
+            solution = f"""def {entry_point}(arr, k):
+    return sorted(arr)[k-1]"""
+        else:
+            # 일반 템플릿
+            solution = f"""def {entry_point}(*args):
+    # TODO: Implement this function
+    return None"""
+        self.logger.log_info(f"🔄 Generated fallback solution for {entry_point}")
+        return solution
+    def validate_syntax(self, solution: str) -> Tuple[bool, Optional[str]]:
+        """솔루션 구문 검증"""
+        try:
+            compile(solution, '<string>', 'exec')
+            return True, None
+        except SyntaxError as e:
+            return False, str(e)
+        except Exception as e:
+            return False, str(e)
+    def extract_function_signature(self, prompt: str) -> Optional[Dict[str, str]]:
+        """프롬프트에서 함수 시그니처 추출"""
+        # def function_name(args) -> return_type: 패턴 매칭
+        pattern = r'def\s+(\w+)\(([^)]*)\)(?:\s*->\s*([^:]+))?:'
+        match = re.search(pattern, prompt)
+        if match:
+            func_name = match.group(1)
+            args = match.group(2)
+            return_type = match.group(3)
+            return {
+                'name': func_name,
+                'args': args.strip(),
+                'return_type': return_type.strip() if return_type else None,
+                'full_signature': match.group(0)
+            }
+        return None
+    def format_solution(self, raw_solution: str, problem: Dict[str, Any]) -> str:
+        """솔루션 형식 정리"""
+        # 기본 정리
+        solution = raw_solution.strip()
+        # 함수 정의 확인 및 수정
+        if not solution.startswith('def '):
+            signature = self.extract_function_signature(problem.get('prompt', ''))
+            if signature:
+                # 함수 정의 추가
+                lines = solution.split('\n')
+                indented_lines = ['    ' + line if line.strip() else line for line in lines]
+                solution = signature['full_signature'] + '\n' + '\n'.join(indented_lines)
+        # 불필요한 설명 텍스트 제거
+        lines = solution.split('\n')
+        code_lines = []
+        in_function = False
+        for line in lines:
+            if line.strip().startswith('def '):
+                in_function = True
+                code_lines.append(line)
+            elif in_function:
+                code_lines.append(line)
+            elif line.strip() and not any(keyword in line.lower() for keyword in
+                                        ['explanation', 'here', 'this function', 'the solution']):
+                code_lines.append(line)
+        return '\n'.join(code_lines).strip()
+    @staticmethod
+    def extract_docstring_from_function(code: str) -> str:
+        """함수 코드에서 docstring을 추출"""
+        import re
+        # 함수 정의 다음에 오는 docstring 패턴 매칭
+        # """...""" 또는 '''...''' 형태
+        docstring_patterns = [
+            r'def\s+\w+\([^)]*\):\s*\n\s*"""(.*?)"""',  # """..."""
+            r'def\s+\w+\([^)]*\):\s*\n\s*\'\'\'(.*?)\'\'\'',  # '''...'''
+        ]
+        for pattern in docstring_patterns:
+            match = re.search(pattern, code, re.DOTALL)
+            if match:
+                docstring = match.group(1).strip()
+                # 여러 줄인 경우 깔끔하게 정리
+                lines = docstring.split('\n')
+                cleaned_lines = []
+                for line in lines:
+                    cleaned_line = line.strip()
+                    if cleaned_line:
+                        cleaned_lines.append(cleaned_line)
+                return ' '.join(cleaned_lines)
+        # docstring이 없는 경우 기본 메시지 반환
+        return "Find the function that produces these outputs from these inputs."
+    def _extract_function_code(self, code: str) -> str:
+        """코드에서 함수 정의와 필요한 import 추출"""
+        import re
+        lines = code.strip().split('\n')
+        import_lines = []
+        func_lines = []
+        in_function = False
+        indent_level = 0
+        # 1. import 문 수집
+        for line in lines:
+            stripped = line.strip()
+            if (stripped.startswith('import ') or stripped.startswith('from ')) and not stripped.startswith('#'):
+                import_lines.append(line)
+        # 2. 함수 정의 찾기
+        for line in lines:
+            if line.strip().startswith('def '):
+                in_function = True
+                func_lines = [line]
+                # 첫 줄의 들여쓰기 레벨 저장
+                indent_level = len(line) - len(line.lstrip())
+            elif in_function:
+                # 빈 줄이거나 같은/더 깊은 들여쓰기면 함수의 일부
+                if not line.strip() or (line.strip() and len(line) - len(line.lstrip()) > indent_level):
+                    func_lines.append(line)
+                else:
+                    # 함수 끝
+                    break
+        # 3. import + function 결합
+        if func_lines:
+            result_lines = import_lines + [''] + func_lines if import_lines else func_lines
+            return '\n'.join(result_lines)
+        else:
+            return code
+    def evaluate_solution(self, problem: Dict[str, Any], solution: str) -> Dict[str, Any]:
+        """LLM 솔루션을 벤치마크 테스트로 평가 (EvalPlus 필수)"""
+        try:
+            # EvalPlus 함수들 임포트 (pip으로 설치된 버전 사용)
+            self.logger.log_info("🔄 Attempting to import EvalPlus...")
+            from evalplus.evaluate import check_correctness
+            from evalplus.gen.util import trusted_exec
+            from evalplus.eval._special_oracle import MBPP_OUTPUT_NOT_NONE_TASKS
+            from evalplus.eval import PASS
+            self.logger.log_info("✅ Using EvalPlus for evaluation")
+        except ImportError as e:
+            # EvalPlus가 없으면 오류로 처리 (fallback 제거)
+            self.logger.log_error(f"❌ EvalPlus is required but not available: {e}")
+            import traceback
+            self.logger.log_error(f"📋 Import traceback: {traceback.format_exc()}")
+            return {
+                'correct': False,
+                'passed_tests': 0,
+                'total_tests': 0,
+                'error': f"EvalPlus import failed: {e}. Please install EvalPlus properly.",
+                'execution_results': [],
+                'base_passed': 0,
+                'plus_passed': 0,
+                'base_total': 0,
+                'plus_total': 0
+            }
+        except Exception as e:
+            self.logger.log_error(f"❌ EvalPlus import failed with unexpected error: {e}")
+            return {
+                'correct': False,
+                'passed_tests': 0,
+                'total_tests': 0,
+                'error': f"EvalPlus import error: {e}",
+                'execution_results': [],
+                'base_passed': 0,
+                'plus_passed': 0,
+                'base_total': 0,
+                'plus_total': 0
+            }
+        result = {
+            'correct': False,
+            'passed_tests': 0,
+            'total_tests': 0,
+            'error': None,
+            'execution_results': [],
+            'base_passed': 0,
+            'plus_passed': 0,
+            'base_total': 0,
+            'plus_total': 0
+        }
+        try:
+            # 1. 함수 정의 추출
+            extracted_code = self._extract_function_code(solution)
+            if not extracted_code:
+                result['error'] = "No function definition found"
+                return result
+            # 2. 데이터셋 타입 결정
+            task_id = problem.get('task_id', '')
+            if task_id.startswith('Mbpp'):
+                dataset = 'mbpp'
+            elif task_id.startswith('HumanEval'):
+                dataset = 'humaneval'
+            else:
+                # 기본값
+                dataset = 'mbpp'
+            # 3. expected outputs 생성 (canonical solution 사용)
+            entry_point = problem.get('entry_point', '')
+            canonical_solution = problem.get('canonical_solution', '')
+            if not canonical_solution:
+                result['error'] = "No canonical_solution found"
+                return result
+            # Expected outputs 계산
+            expected_output = {}
+            # Base tests
+            base_inputs = problem.get('base_input', [])
+            if base_inputs:
+                expected_output['base'], expected_output['base_time'] = trusted_exec(
+                    problem.get('prompt', '') + canonical_solution,
+                    base_inputs,
+                    entry_point,
+                    record_time=True,
+                    output_not_none=entry_point in MBPP_OUTPUT_NOT_NONE_TASKS
+                )
+            # Plus tests
+            plus_inputs = problem.get('plus_input', [])
+            if plus_inputs:
+                expected_output['plus'], expected_output['plus_time'] = trusted_exec(
+                    problem.get('prompt', '') + canonical_solution,
+                    plus_inputs,
+                    entry_point,
+                    record_time=True,
+                    output_not_none=entry_point in MBPP_OUTPUT_NOT_NONE_TASKS
+                )
+            # 4. EvalPlus check_correctness 호출
+            evalplus_result = check_correctness(
+                dataset=dataset,
+                completion_id=0,
+                problem=problem,
+                solution=extracted_code,
+                expected_output=expected_output,
+                base_only=False,  # Plus tests도 실행
+                fast_check=False,  # 모든 테스트 실행
+                identifier=task_id
+            )
+            # 5. 결과 파싱
+            if 'base' in evalplus_result:
+                base_stat, base_details = evalplus_result['base']
+                result['base_total'] = len(base_inputs)
+                if base_stat == PASS:
+                    result['base_passed'] = result['base_total']
+                else:
+                    result['base_passed'] = sum(1 for d in base_details if d) if base_details else 0
+                result['passed_tests'] += result['base_passed']
+                result['total_tests'] += result['base_total']
+            if 'plus' in evalplus_result:
+                plus_stat, plus_details = evalplus_result['plus']
+                result['plus_total'] = len(plus_inputs)
+                if plus_stat == PASS:
+                    result['plus_passed'] = result['plus_total']
+                else:
+                    result['plus_passed'] = sum(1 for d in plus_details if d) if plus_details else 0
+                result['passed_tests'] += result['plus_passed']
+                result['total_tests'] += result['plus_total']
+            # EvalPlus 기준: 모든 테스트 통과해야 correct
+            result['correct'] = (result['passed_tests'] == result['total_tests']) and result['total_tests'] > 0
+            # 에러 메시지 설정
+            if not result['correct']:
+                if base_stat != PASS:
+                    result['error'] = f"Base tests failed: {base_stat}"
+                elif 'plus' in evalplus_result and plus_stat != PASS:
+                    result['error'] = f"Plus tests failed: {plus_stat}"
+            # 로깅
+            self.logger.log_info(f"EvalPlus evaluation for {task_id}:")
+            self.logger.log_info(f"  Base: {result['base_passed']}/{result['base_total']}")
+            self.logger.log_info(f"  Plus: {result['plus_passed']}/{result['plus_total']}")
+            self.logger.log_info(f"  Total: {result['passed_tests']}/{result['total_tests']}")
+            self.logger.log_info(f"  Correct: {result['correct']}")
+        except Exception as e:
+            result['error'] = f"Evaluation failed: {str(e)}"
+            import traceback
+            self.logger.log_info(f"Evaluation traceback: {traceback.format_exc()}")
+        return result
+    @staticmethod
+    def load_model_with_optimizations(model_name: str, device: str,
+                                    config: TestTimeConfig, use_vllm: bool = True, tensor_parallel_size: int = 1) -> Tuple[Any, Any]:
+        """Load the model and its tokenizer (AZR-style optimizations, vLLM supported).
+        vLLM is tried first when ``use_vllm`` is set, ``VLLM_AVAILABLE`` is true and
+        ``device`` starts with ``'cuda'``; any exception during that attempt is printed
+        and the HuggingFace loader is used instead.
+        Args:
+            model_name: Model identifier passed to the tokenizer and model loaders.
+            device: Device string such as ``'cuda:0'``; only its ``'cuda'`` prefix and
+                the part after ``':'`` are inspected here.
+            config: Supplies ``torch_dtype``, ``gpu_memory_utilization``,
+                ``use_flash_attention`` and, optionally, ``max_model_len``.
+            use_vllm: Whether to attempt the vLLM path.
+            tensor_parallel_size: Forwarded to vLLM's ``LLM``.
+        Returns:
+            ``(model, tokenizer)``; the model is a vLLM ``LLM`` or a HuggingFace
+            causal LM depending on which path succeeded.
+        """
+        # Load the tokenizer; fall back to EOS as the pad token when none is defined.
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        # Check whether vLLM can be used and, if so, load the model with it.
+        if use_vllm and VLLM_AVAILABLE and device.startswith('cuda'):
+            try:
+                # GPU selection: an already-set CUDA_VISIBLE_DEVICES takes precedence.
+                # NOTE(review): when the variable is set here from e.g. 'cuda:1' and vLLM
+                # then fails, the HuggingFace fallback below still uses the original
+                # device string - confirm that index is valid under the new mask.
+                import os
+                if 'CUDA_VISIBLE_DEVICES' not in os.environ:
+                    gpu_id = device.split(':')[1] if ':' in device else '0'
+                    os.environ['CUDA_VISIBLE_DEVICES'] = gpu_id
+                else:
+                    # Use the CUDA_VISIBLE_DEVICES value that is already set.
+                    gpu_id = os.environ['CUDA_VISIBLE_DEVICES']
+                    print(f"🎯 Using existing CUDA_VISIBLE_DEVICES: {gpu_id}")
+                # Load the vLLM model (memory-optimized for a Ray actor environment).
+                model = LLM(
+                    model=model_name,
+                    dtype=str(config.torch_dtype).split('.')[-1],  # torch.float16 -> float16
+                    trust_remote_code=True,
+                    gpu_memory_utilization=config.gpu_memory_utilization,
+                    max_model_len=getattr(config, 'max_model_len', 2048),  # defaults to 2048 when the config has no max_model_len
+                    tensor_parallel_size=tensor_parallel_size,  # matched to the number of GPUs
+                )
+                print(f"✅ VLLM model loaded successfully on GPU {gpu_id} (tensor_parallel_size={tensor_parallel_size})")
+                return model, tokenizer
+            except Exception as e:
+                # Best-effort: report the failure and continue with HuggingFace below.
+                import traceback
+                print(f"⚠️  VLLM loading failed: {e}")
+                print(f"🔍 Full traceback: {traceback.format_exc()}")
+                print(f"🔄 Falling back to HuggingFace")
+        # Load the model with HuggingFace (the original path).
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=config.torch_dtype,
+            device_map=device if device.startswith('cuda') else None,
+            trust_remote_code=True,
+            attn_implementation="flash_attention_2" if config.use_flash_attention and device.startswith('cuda') else None,
+            use_cache=False,  # cache disabled for training use
+        )
+        # Gradient checkpointing is turned off: it is unnecessary for inference and only
+        # produces warnings. Enable it separately when training is required.
+        if hasattr(model, 'gradient_checkpointing_disable'):
+            model.gradient_checkpointing_disable()
+        print(f"✅ HuggingFace model loaded successfully")
+        return model, tokenizer

absolute_zero_reasoner/testtime/task_generator.py ADDED Viewed

	@@ -0,0 +1,473 @@

+"""
+TestTime Task Generator
+AZR 추론용 프롬프트 기반 Induction/Deduction/Abduction 태스크 생성
+요구사항 3: "AZR처럼 템플릿을 활용하여 induction, deduction, abduction 문제를 생성"
+"""
+from typing import Dict, List, Any, Optional, Tuple
+import random
+from .config import TestTimeConfig
+from .logger import TestTimeLogger
+# AZR 추론용 프롬프트 직접 사용
+from ..data_construction.prompts import get_code_problem_predictor_prompt
+from .solution_generator import InitialSolutionGenerator
+class TestTimeTaskGenerator:
+    """IPO 트리플에서 3종 태스크 생성"""
+    def __init__(self, config: TestTimeConfig, logger: Optional[TestTimeLogger] = None):
+        self.config = config
+        self.logger = logger or TestTimeLogger()
+        # AZR 추론용 프롬프트 직접 사용 (get_code_problem_predictor_prompt)
+        # 함수 코드 정리용 solution generator 인스턴스 생성
+        self.solution_generator = InitialSolutionGenerator(None, None, config, logger)
+    def generate_tasks(self, ipo_triples: List[Dict[str, Any]],
+                      problem_id: str, round_num: int = 1) -> Dict[str, List[Dict[str, Any]]]:
+        """IPO 트리플에서 3종 태스크 생성 (각 트리플마다 3가지 태스크 모두 생성)"""
+        self.logger.log_info(f"🎯 Generating tasks for {problem_id} from {len(ipo_triples)} triples")
+        # 🔧 수정: 분배 로직 제거, 각 IPO 트리플에서 3가지 태스크 모두 생성
+        induction_tasks = []
+        deduction_tasks = []
+        abduction_tasks = []
+        for i, triple in enumerate(ipo_triples):
+            # 각 트리플에서 induction 태스크 생성
+            induction_task = self._generate_single_induction_task(triple, i, problem_id, round_num)
+            if induction_task:
+                induction_tasks.append(induction_task)
+            # 각 트리플에서 deduction 태스크 생성
+            deduction_task = self._generate_single_deduction_task(triple, i, problem_id, round_num)
+            if deduction_task:
+                deduction_tasks.append(deduction_task)
+            # 각 트리플에서 abduction 태스크 생성
+            abduction_task = self._generate_single_abduction_task(triple, i, problem_id, round_num)
+            if abduction_task:
+                abduction_tasks.append(abduction_task)
+        all_tasks = {
+            'induction': induction_tasks,
+            'deduction': deduction_tasks,
+            'abduction': abduction_tasks
+        }
+        # 로깅
+        task_counts = {k: len(v) for k, v in all_tasks.items()}
+        total_generated = sum(task_counts.values())
+        self.logger.log_info(f"✅ Generated {len(induction_tasks)} induction, {len(deduction_tasks)} deduction, {len(abduction_tasks)} abduction tasks")
+        self.logger.log_task_generation(
+            problem_id,
+            induction_tasks,
+            deduction_tasks,
+            abduction_tasks
+        )
+        return all_tasks
+    def _generate_single_induction_task(self, triple: Dict[str, Any], index: int, problem_id: str, round_num: int) -> Optional[Dict[str, Any]]:
+        """단일 IPO 트리플에서 induction 태스크 생성"""
+        try:
+            # 입력-출력 쌍 준비
+            # 평가를 위해서는 실제 인자(triple['input'])를 사용
+            input_output_pairs = [(triple['input'], triple['actual_output'])]
+            # 표시용으로는 full_input_str 사용
+            display_input = triple.get('full_input_str', triple['input'])
+            # 🔧 수정: clean한 함수 코드만 추출 (test case 제거)
+            clean_program = self._extract_clean_function_code(triple['program'])
+            # 매개변수로 받은 problem_id 사용 (AZR 통합용)
+            original_problem_id = triple.get('id', '').split('_triple_')[0]  # 원본 추출 로직 보존
+            # HumanEval인 경우 특별 처리
+            if 'HumanEval' in problem_id:
+                # 원본 프로그램에서 함수 설명 추출 (doctest 예시가 있는 원본에서)
+                extracted_message = self._extract_function_description(triple['program'])
+                if not extracted_message:
+                    extracted_message = "Find the function that produces these outputs from these inputs."
+            else:
+                # MBPP는 기존 방식 유지
+                extracted_message = InitialSolutionGenerator.extract_docstring_from_function(clean_program)
+            # 사용자 정의: input_output_pairs + message → program
+            # 프롬프트용으로는 display 입력 사용
+            display_pairs = [(display_input, triple['actual_output'])]
+            azr_prompt = get_code_problem_predictor_prompt(
+                problem_type='code_f',
+                snippet=clean_program,  # 🔧 수정: clean한 코드 사용
+                input_output_pairs=display_pairs,
+                message=extracted_message
+            )
+            # AZR 메타데이터 생성
+            source_program_id = triple.get('source_program_id', f'program_{index//3}')
+            ipo_index = triple.get('ipo_index', index % 3)
+            task = {
+                'task_id': f'induction_{index}',
+                'task_type': 'induction',
+                'triple_id': triple['id'],
+                'source_program_id': source_program_id,  # 🆕 추가
+                'ipo_index': ipo_index,                  # 🆕 추가
+                'ipo_triple': {                          # 🆕 추가
+                    'input': triple['input'],
+                    'output': triple['actual_output'],
+                    'program': triple['program']
+                },
+                'prompt': azr_prompt,
+                'expected_solution': clean_program,  # 🔧 수정: clean한 코드 사용
+                'evaluation_data': {
+                    'input_output_pairs': input_output_pairs,  # 평가용으로는 실제 인자 사용
+                    'original_function': triple['program']
+                },
+                # 🆕 AZR 학습용 메타데이터
+                'uid': f"{problem_id}_round_{round_num}_induction_{index}",
+                'ipo_group_id': f"{problem_id}_program_{source_program_id}_ipo_{ipo_index}",
+                'original_problem_id': problem_id,
+                'round': round_num,
+                'extra_info': {'metric': 'code_f'},  # induction task는 code_f
+                'basic_accuracy': 0.0,  # 초기값, evaluation에서 업데이트됨
+                'ground_truth': clean_program  # AZR parquet 형식에서 사용
+            }
+            return task
+        except Exception as e:
+            self.logger.log_error(f"Failed to generate induction task for triple {triple.get('id', 'unknown')}: {e}")
+            return None
+    def _generate_single_deduction_task(self, triple: Dict[str, Any], index: int, problem_id: str, round_num: int) -> Optional[Dict[str, Any]]:
+        """단일 IPO 트리플에서 deduction 태스크 생성"""
+        try:
+            # 매개변수로 받은 problem_id 사용 (AZR 통합용)
+            original_problem_id = triple.get('id', '').split('_triple_')[0]  # 원본 추출 로직 보존
+            # HumanEval인 경우 doctest 예시 제거
+            if 'HumanEval' in original_problem_id:
+                clean_program = self._remove_doctest_examples(triple['program'])
+            else:
+                # MBPP는 기존 방식 유지
+                clean_program = self._extract_clean_function_code(triple['program'])
+            # 사용자 정의: program + input → output
+            azr_prompt = get_code_problem_predictor_prompt(
+                problem_type='code_o',  # 프로그램+입력→출력
+                snippet=clean_program,  # 🔧 수정: clean한 코드 사용
+                input_args=triple['input']
+            )
+            # AZR 메타데이터 생성
+            source_program_id = triple.get('source_program_id', f'program_{index//3}')
+            ipo_index = triple.get('ipo_index', index % 3)
+            task = {
+                'task_id': f'deduction_{index}',
+                'task_type': 'deduction',
+                'triple_id': triple['id'],
+                'source_program_id': source_program_id,  # 🆕 추가
+                'ipo_index': ipo_index,                  # 🆕 추가
+                'ipo_triple': {                          # 🆕 추가
+                    'input': triple['input'],
+                    'output': triple['actual_output'],
+                    'program': triple['program']
+                },
+                'prompt': azr_prompt,
+                'expected_solution': triple['actual_output'],  # 🔧 수정: expected_solution으로 통일
+                'evaluation_data': {
+                    'function_code': clean_program,  # 🔧 수정: clean한 코드 사용 (complete_pipeline과 일치)
+                    'test_input': triple['input'],  # 🔧 수정: complete_pipeline과 일치
+                    'original_function': triple['program']
+                },
+                # 🆕 AZR 학습용 메타데이터
+                'uid': f"{problem_id}_round_{round_num}_deduction_{index}",
+                'ipo_group_id': f"{problem_id}_program_{source_program_id}_ipo_{ipo_index}",
+                'original_problem_id': problem_id,
+                'round': round_num,
+                'extra_info': {'metric': 'code_o'},  # deduction task는 code_o
+                'basic_accuracy': 0.0,  # 초기값, evaluation에서 업데이트됨
+                'ground_truth': triple['actual_output']  # AZR parquet 형식에서 ���용
+            }
+            return task
+        except Exception as e:
+            self.logger.log_error(f"Failed to generate deduction task for triple {triple.get('id', 'unknown')}: {e}")
+            return None
+    def _generate_single_abduction_task(self, triple: Dict[str, Any], index: int, problem_id: str, round_num: int) -> Optional[Dict[str, Any]]:
+        """단일 IPO 트리플에서 abduction 태스크 생성"""
+        try:
+            # 매개변수로 받은 problem_id 사용 (AZR 통합용)
+            original_problem_id = triple.get('id', '').split('_triple_')[0]  # 원본 추출 로직 보존
+            # HumanEval인 경우 doctest 예시 제거
+            if 'HumanEval' in original_problem_id:
+                clean_program = self._remove_doctest_examples(triple['program'])
+            else:
+                # MBPP는 기존 방식 유지
+                clean_program = self._extract_clean_function_code(triple['program'])
+            # 사용자 정의: program + output → input
+            azr_prompt = get_code_problem_predictor_prompt(
+                problem_type='code_i',  # 프로그램+출력→입력
+                snippet=clean_program,  # 🔧 수정: clean한 코드 사용
+                output=triple['actual_output']  # 🔧 수정: output 파라미터 사용
+            )
+            # AZR 메타데이터 생성
+            source_program_id = triple.get('source_program_id', f'program_{index//3}')
+            ipo_index = triple.get('ipo_index', index % 3)
+            task = {
+                'task_id': f'abduction_{index}',
+                'task_type': 'abduction',
+                'triple_id': triple['id'],
+                'source_program_id': source_program_id,  # 🆕 추가
+                'ipo_index': ipo_index,                  # 🆕 추가
+                'ipo_triple': {                          # 🆕 추가
+                    'input': triple['input'],
+                    'output': triple['actual_output'],
+                    'program': triple['program']
+                },
+                'prompt': azr_prompt,
+                'expected_solution': triple.get('full_input_str', triple['input']),  # 🔧 수정: 전체 함수 호출 사용
+                'evaluation_data': {
+                    'function_code': clean_program,  # 🔧 수정: clean한 코드 사용 (complete_pipeline과 일치)
+                    'expected_output': triple['actual_output'],  # 🔧 수정: complete_pipeline과 일치
+                    'original_function': triple['program']
+                },
+                # 🆕 AZR 학습용 메타데이터
+                'uid': f"{problem_id}_round_{round_num}_abduction_{index}",
+                'ipo_group_id': f"{problem_id}_program_{source_program_id}_ipo_{ipo_index}",
+                'original_problem_id': problem_id,
+                'round': round_num,
+                'extra_info': {'metric': 'code_i'},  # abduction task는 code_i
+                'basic_accuracy': 0.0,  # 초기값, evaluation에서 업데이트됨
+                'ground_truth': triple.get('full_input_str', triple['input'])  # AZR parquet 형식에서 사용
+            }
+            return task
+        except Exception as e:
+            self.logger.log_error(f"Failed to generate abduction task for triple {triple.get('id', 'unknown')}: {e}")
+            return None
+    def generate_induction_tasks(self, ipo_triples: List[Dict[str, Any]],
+                                count: int) -> List[Dict[str, Any]]:
+        """Induction 태스크: 입력-출력 쌍에서 프로그램 추론 (사용자 정의 유지)"""
+        tasks = []
+        selected_triples = random.sample(ipo_triples, min(count, len(ipo_triples)))
+        for i, triple in enumerate(selected_triples):
+            # 입력-출력 쌍 준비
+            input_output_pairs = [(triple['input'], triple['actual_output'])]
+            # 🔧 수정: clean한 함수 코드만 추출 (test case 제거)
+            clean_program = self._extract_clean_function_code(triple['program'])
+            # LLM이 생성한 함수에서 docstring 추출해서 message로 사용
+            extracted_message = InitialSolutionGenerator.extract_docstring_from_function(clean_program)
+            # 사용자 정의: input_output_pairs + message → program
+            azr_prompt = get_code_problem_predictor_prompt(
+                problem_type='code_f',
+                snippet=clean_program,  # 🔧 수정: clean한 코드 사용
+                input_output_pairs=input_output_pairs,
+                message=extracted_message
+            )
+            task = {
+                'task_id': f'induction_{i}',
+                'task_type': 'induction',
+                'triple_id': triple['id'],
+                'prompt': azr_prompt,
+                'expected_solution': clean_program,  # 🔧 수정: clean한 코드 사용
+                'evaluation_data': {
+                    'input_output_pairs': input_output_pairs,
+                    'original_function': triple['program']
+                }
+            }
+            tasks.append(task)
+        return tasks
+    def generate_deduction_tasks(self, ipo_triples: List[Dict[str, Any]],
+                               count: int) -> List[Dict[str, Any]]:
+        """Deduction 태스크: 프로그램+입력에서 출력 예측 (사용자 정의에 맞게 수정)"""
+        tasks = []
+        selected_triples = random.sample(ipo_triples, min(count, len(ipo_triples)))
+        for i, triple in enumerate(selected_triples):
+            # 🔧 수정: clean한 함수 코드만 추출 (test case 제거)
+            clean_program = self._extract_clean_function_code(triple['program'])
+            # 사용자 정의: program + input → output
+            azr_prompt = get_code_problem_predictor_prompt(
+                problem_type='code_o',  # 프로그램+입력→출력
+                snippet=clean_program,  # 🔧 수정: clean한 코드 사용
+                input_args=triple['input']
+            )
+            task = {
+                'task_id': f'deduction_{i}',
+                'task_type': 'deduction',
+                'triple_id': triple['id'],
+                'prompt': azr_prompt,
+                'expected_solution': triple['actual_output'],
+                'evaluation_data': {
+                    'function_code': clean_program,  # 🔧 수정: clean한 코드 사용
+                    'test_input': triple['input']
+                }
+            }
+            tasks.append(task)
+        return tasks
+    def generate_abduction_tasks(self, ipo_triples: List[Dict[str, Any]],
+                               count: int) -> List[Dict[str, Any]]:
+        """Abduction 태스크: 프로그램+출력에서 입력 예측 (사용자 정의에 맞게 수정)"""
+        tasks = []
+        selected_triples = random.sample(ipo_triples, min(count, len(ipo_triples)))
+        for i, triple in enumerate(selected_triples):
+            # 🔧 수정: clean한 함수 코드만 추출 (test case 제거)
+            clean_program = self._extract_clean_function_code(triple['program'])
+            # 사용자 정의: program + output → input
+            azr_prompt = get_code_problem_predictor_prompt(
+                problem_type='code_i',  # 프로그램+출력→입력
+                snippet=clean_program,  # 🔧 수정: clean한 코드 사용
+                output=triple['actual_output']
+            )
+            task = {
+                'task_id': f'abduction_{i}',
+                'task_type': 'abduction',
+                'triple_id': triple['id'],
+                'prompt': azr_prompt,
+                'expected_solution': triple.get('full_input_str', triple['input']),  # 🔧 수정: 전체 함수 호출 사용
+                'evaluation_data': {
+                    'function_code': clean_program,  # 🔧 수정: clean한 코드 사용
+                    'expected_output': triple['actual_output']
+                }
+            }
+            tasks.append(task)
+        return tasks
+    def _extract_clean_function_code(self, program_with_tests: str) -> str:
+        """🔧 수정: 프로그램에서 test case와 assert문을 제거하고 순수한 함수 코드만 추출"""
+        # solution_generator의 _extract_function_code 메서드 사용
+        clean_code = self.solution_generator._extract_function_code(program_with_tests)
+        # 로깅 (디버깅용)
+        if "assert" in program_with_tests or "# Test" in program_with_tests:
+            self.logger.log_info("🧹 Cleaned function code (removed test cases)")
+        return clean_code
+    def get_task_statistics(self, all_tasks: Dict[str, List[Dict[str, Any]]]) -> Dict[str, Any]:
+        """태스크 생성 통계"""
+        stats = {
+            'total_tasks': sum(len(tasks) for tasks in all_tasks.values()),
+            'tasks_by_type': {task_type: len(tasks) for task_type, tasks in all_tasks.items()},
+            'task_types': list(all_tasks.keys())
+        }
+        return stats
+    def _remove_doctest_examples(self, code: str) -> str:
+        """HumanEval 코드에서 doctest 예시 제거"""
+        import re
+        lines = code.split('\n')
+        result_lines = []
+        in_docstring = False
+        docstring_indent = 0
+        skip_next = False
+        for line in lines:
+            stripped = line.strip()
+            # docstring 시작/끝 감지
+            if '"""' in line or "'''" in line:
+                if not in_docstring:
+                    in_docstring = True
+                    docstring_indent = len(line) - len(line.lstrip())
+                    result_lines.append(line)
+                else:
+                    in_docstring = False
+                    result_lines.append(line)
+                continue
+            # doctest 예시 라인 건너뛰기
+            if in_docstring:
+                if stripped.startswith('>>>'):
+                    skip_next = True  # 다음 라인(결과)도 건너뛰기
+                    continue
+                elif skip_next and stripped and not stripped.startswith('>>>'):
+                    skip_next = False
+                    continue
+                else:
+                    skip_next = False
+            result_lines.append(line)
+        return '\n'.join(result_lines)
+    def _extract_function_description(self, code: str) -> str:
+        """docstring에서 함수 설명 추출 (예시 제외)"""
+        import re
+        # 여러 형태의 docstring 매칭
+        patterns = [
+            r'"""(.*?)"""', # triple double quotes
+            r"'''(.*?)'''", # triple single quotes
+        ]
+        for pattern in patterns:
+            match = re.search(pattern, code, re.DOTALL)
+            if match:
+                description = match.group(1).strip()
+                # 예시 전까지의 모든 설명 추출
+                result_lines = []
+                lines = description.split('\n')
+                for line in lines:
+                    cleaned_line = line.strip()
+                    # >>> 예시가 시작되면 중단
+                    if cleaned_line.startswith('>>>'):
+                        break
+                    # 빈 줄이 아니고 예시가 아닌 경우 추가
+                    if cleaned_line:
+                        result_lines.append(cleaned_line)
+                # 모든 설명 라인을 공백으로 연결
+                if result_lines:
+                    return ' '.join(result_lines)
+        return ""

absolute_zero_reasoner/trainer/__init__.py ADDED Viewed

File without changes

absolute_zero_reasoner/trainer/ppo/__init__.py ADDED Viewed

File without changes

absolute_zero_reasoner/trainer/ppo/azr_ray_trainer.py ADDED Viewed

The diff for this file is too large to render. See raw diff

absolute_zero_reasoner/trainer/ppo/reason_rl_ray_trainer.py ADDED Viewed

	@@ -0,0 +1,768 @@

+import uuid
+from typing import Optional
+from copy import deepcopy
+from collections import defaultdict
+from omegaconf import OmegaConf, open_dict
+import torch
+import numpy as np
+from torch.utils.data import Dataset, Sampler
+from verl.trainer.ppo.ray_trainer import RayPPOTrainer, apply_kl_penalty, compute_advantage, reduce_metrics, compute_data_metrics, compute_timing_metrics, AdvantageEstimator, compute_response_mask
+from verl.utils.debug import marked_timer
+from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto, DataProto
+from verl.utils.dataset.rl_dataset import collate_fn
+from verl import DataProto
+from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
+from verl.single_controller.ray import RayWorkerGroup
+from verl.trainer.ppo import core_algos
+from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
+from verl.trainer.ppo.ray_trainer import Role, WorkerType, ResourcePoolManager
+from verl.utils.tracking import ValidationGenerationsLogger
+from absolute_zero_reasoner.utils.dataset.rl_dataset import RLHFDataset
+class ReasonRLRayPPOTrainer(RayPPOTrainer):
+    def __init__(
+        self,
+        config,
+        tokenizer,
+        role_worker_mapping: dict[Role, WorkerType],
+        resource_pool_manager: ResourcePoolManager,
+        ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+        processor=None,
+        reward_fn=None,
+        val_reward_fn=None,
+        train_dataset: Optional[Dataset] = None,
+        val_dataset: Optional[Dataset] = None,
+        collate_fn=None,
+        train_sampler: Optional[Sampler] = None,
+        device_name="cuda",
+    ):
+        """
+        Initialize distributed PPO trainer with Ray backend.
+        Note that this trainer runs on the driver process on a single CPU/GPU node.
+        This constructor sets all attributes itself and does not call
+        ``super().__init__``. ``train_dataset``, ``val_dataset``, ``collate_fn`` and
+        ``train_sampler`` are accepted but not read in this body; the dataloaders
+        are built from ``config.data`` by ``_create_dataloader``.
+        Args:
+            config: Configuration object containing training parameters.
+            tokenizer: Tokenizer used for encoding and decoding text.
+            role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes.
+            resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools.
+            ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup.
+            processor: Optional data processor, used for multimodal data
+            reward_fn: Function for computing rewards during training.
+            val_reward_fn: Function for computing rewards during validation.
+            train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None.
+            val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None.
+            collate_fn: Function to collate data samples into batches.
+            train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None.
+            device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to "cuda".
+        Raises:
+            AssertionError: If the hybrid engine is disabled or no ActorRollout role is mapped.
+            NotImplementedError: If ``config.algorithm.adv_estimator`` is not one of the handled estimators.
+        """
+        # Store the tokenizer for text processing
+        self.tokenizer = tokenizer
+        self.processor = processor
+        self.config = config
+        self.reward_fn = reward_fn
+        self.val_reward_fn = val_reward_fn
+        self.hybrid_engine = config.actor_rollout_ref.hybrid_engine
+        assert self.hybrid_engine, "Currently, only support hybrid engine"
+        if self.hybrid_engine:
+            assert Role.ActorRollout in role_worker_mapping, f"{role_worker_mapping.keys()=}"
+        self.role_worker_mapping = role_worker_mapping
+        self.resource_pool_manager = resource_pool_manager
+        # Optional components are enabled purely by their presence in the role mapping.
+        self.use_reference_policy = Role.RefPolicy in role_worker_mapping
+        self.use_rm = Role.RewardModel in role_worker_mapping
+        self.ray_worker_group_cls = ray_worker_group_cls
+        self.device_name = device_name
+        self.validation_generations_logger = ValidationGenerationsLogger()
+        # if ref_in_actor is True, the reference policy will be actor without lora applied
+        self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0
+        # define in-reward KL control
+        # kl loss control currently not supported
+        # NOTE: kl_ctrl_in_reward is only defined when use_kl_in_reward is truthy.
+        if config.algorithm.use_kl_in_reward:
+            self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl)
+        # Only GAE uses a critic; the listed estimators run without one.
+        if self.config.algorithm.adv_estimator == AdvantageEstimator.GAE:
+            self.use_critic = True
+        elif self.config.algorithm.adv_estimator in [
+            AdvantageEstimator.GRPO,
+            AdvantageEstimator.GRPO_PASSK,
+            AdvantageEstimator.REINFORCE_PLUS_PLUS,
+            AdvantageEstimator.REMAX,
+            AdvantageEstimator.RLOO,
+            AdvantageEstimator.OPO,
+            AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE,
+        ]:
+            self.use_critic = False
+        else:
+            raise NotImplementedError
+        # Both helpers read self.use_critic / self.use_reference_policy set above.
+        self._validate_config()
+        self._create_dataloader()
+    def _validate_config(self):
+        """Sanity-check ``self.config`` before any worker is created.
+        Verifies batch-size divisibility, the micro-batch-size settings of actor,
+        ref, rollout, critic and reward model, the loss aggregation mode,
+        sequence-parallel prerequisites, and the validation / multi-turn rollout
+        options. Prints a notice or warning for a few non-fatal combinations.
+        Raises:
+            AssertionError: If one of the consistency checks fails.
+            ValueError: If a micro-batch-size pair is both unset or both set.
+        """
+        config = self.config
+        # number of GPUs total
+        n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes
+        if config.actor_rollout_ref.actor.strategy == "megatron":
+            model_parallel_size = config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size * config.actor_rollout_ref.actor.megatron.pipeline_model_parallel_size
+            assert n_gpus % (model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size) == 0, f"n_gpus ({n_gpus}) must be divisible by model_parallel_size ({model_parallel_size}) times context_parallel_size ({config.actor_rollout_ref.actor.megatron.context_parallel_size})"
+            megatron_dp = n_gpus // (model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size)
+            minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu
+        else:
+            minimal_bsz = n_gpus
+        # 1. Check total batch size for data correctness
+        real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n
+        # NOTE(review): the message mentions n_gpus, but the divisor actually checked
+        # is minimal_bsz, which differs from n_gpus under the megatron strategy.
+        assert real_train_batch_size % minimal_bsz == 0, f"real_train_batch_size ({real_train_batch_size}) must be divisible by total n_gpus ({n_gpus})."
+        # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu"
+        # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu".
+        def check_mutually_exclusive(mbs, mbs_per_gpu, name: str):
+            # Maps a config section to its parameter name, used only for error messages.
+            settings = {
+                "actor_rollout_ref.actor": "micro_batch_size",
+                "critic": "micro_batch_size",
+                "reward_model": "micro_batch_size",
+                "actor_rollout_ref.ref": "log_prob_micro_batch_size",
+                "actor_rollout_ref.rollout": "log_prob_micro_batch_size",
+            }
+            if name in settings:
+                param = settings[name]
+                param_per_gpu = f"{param}_per_gpu"
+                if mbs is None and mbs_per_gpu is None:
+                    raise ValueError(f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'.")
+                if mbs is not None and mbs_per_gpu is not None:
+                    # NOTE(review): the concatenated message has no space before "is supported".
+                    raise ValueError(f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. Please remove '{name}.{param}' because only '*_{param_per_gpu}'" + "is supported (the former is deprecated).")
+        # The actor/ref/rollout checks below are all skipped when the actor uses dynamic batch sizes.
+        if not config.actor_rollout_ref.actor.use_dynamic_bsz:
+            # actor: ppo_micro_batch_size vs. ppo_micro_batch_size_per_gpu
+            check_mutually_exclusive(
+                config.actor_rollout_ref.actor.ppo_micro_batch_size,
+                config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu,
+                "actor_rollout_ref.actor",
+            )
+            if self.use_reference_policy:
+                # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu
+                check_mutually_exclusive(
+                    config.actor_rollout_ref.ref.log_prob_micro_batch_size,
+                    config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu,
+                    "actor_rollout_ref.ref",
+                )
+            #  The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu
+            check_mutually_exclusive(
+                config.actor_rollout_ref.rollout.log_prob_micro_batch_size,
+                config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu,
+                "actor_rollout_ref.rollout",
+            )
+        if self.use_critic and not config.critic.use_dynamic_bsz:
+            # Check for critic micro-batch size conflicts
+            check_mutually_exclusive(config.critic.ppo_micro_batch_size, config.critic.ppo_micro_batch_size_per_gpu, "critic")
+        # Check for reward model micro-batch size conflicts
+        if config.reward_model.enable and not config.reward_model.use_dynamic_bsz:
+            check_mutually_exclusive(config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model")
+        # Actor
+        # check if train_batch_size is larger than ppo_mini_batch_size
+        # if NOT dynamic_bsz, we must ensure:
+        #    ppo_mini_batch_size is divisible by ppo_micro_batch_size
+        #    ppo_micro_batch_size * sequence_parallel_size >= n_gpus
+        if not config.actor_rollout_ref.actor.use_dynamic_bsz:
+            # assert config.data.train_batch_size >= config.actor_rollout_ref.actor.ppo_mini_batch_size
+            sp_size = config.actor_rollout_ref.actor.get("ulysses_sequence_parallel_size", 1)
+            if config.actor_rollout_ref.actor.ppo_micro_batch_size is not None:
+                assert config.actor_rollout_ref.actor.ppo_mini_batch_size % config.actor_rollout_ref.actor.ppo_micro_batch_size == 0
+                assert config.actor_rollout_ref.actor.ppo_micro_batch_size * sp_size >= n_gpus
+        assert config.actor_rollout_ref.actor.loss_agg_mode in [
+            "token-mean",
+            "seq-mean-token-sum",
+            "seq-mean-token-mean",
+            "seq-mean-token-sum-norm",
+        ], f"Invalid loss_agg_mode: {config.actor_rollout_ref.actor.loss_agg_mode}"
+        # Both KL mechanisms enabled at once is allowed; it is only reported.
+        if config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss:
+            print("NOTICE: You have both enabled in-reward kl and kl loss.")
+        # critic
+        if self.use_critic and not config.critic.use_dynamic_bsz:
+            assert config.data.train_batch_size >= config.critic.ppo_mini_batch_size
+            sp_size = config.critic.get("ulysses_sequence_parallel_size", 1)
+            if config.critic.ppo_micro_batch_size is not None:
+                assert config.critic.ppo_mini_batch_size % config.critic.ppo_micro_batch_size == 0
+                assert config.critic.ppo_micro_batch_size * sp_size >= n_gpus
+        # Check if use_remove_padding is enabled when using sequence parallelism for fsdp
+        if config.actor_rollout_ref.actor.strategy == "fsdp" and (config.actor_rollout_ref.actor.get("ulysses_sequence_parallel_size", 1) > 1 or config.actor_rollout_ref.ref.get("ulysses_sequence_parallel_size", 1) > 1):
+            assert config.actor_rollout_ref.model.use_remove_padding, "When using sequence parallelism for actor/ref policy, you must enable `use_remove_padding`."
+        if self.use_critic and config.critic.strategy == "fsdp":
+            if config.critic.get("ulysses_sequence_parallel_size", 1) > 1:
+                assert config.critic.model.use_remove_padding, "When using sequence parallelism for critic, you must enable `use_remove_padding`."
+        if config.data.get("val_batch_size", None) is not None:
+            print("WARNING: val_batch_size is deprecated." + " Validation datasets are sent to inference engines as a whole batch," + " which will schedule the memory themselves.")
+        # check eval config
+        if config.actor_rollout_ref.rollout.val_kwargs.do_sample:
+            assert config.actor_rollout_ref.rollout.temperature > 0, "validation gen temperature should be greater than 0 when enabling do_sample"
+        # check multi_turn with tool config
+        if config.actor_rollout_ref.rollout.multi_turn.enable:
+            assert config.actor_rollout_ref.rollout.multi_turn.tool_config_path is not None or config.actor_rollout_ref.rollout.multi_turn.interaction_config_path is not None, "tool_config_path or interaction_config_path must be set when enabling multi_turn with tool, due to no role-playing support"
+            assert config.algorithm.adv_estimator in [AdvantageEstimator.GRPO], "only GRPO is tested for multi-turn with tool"
+        print("[validate_config] All configuration checks passed successfully!")
+    def _create_dataloader(self):
+        """
+        Changed the prompt length of validation set to have another prompt length.
+        Create the train and val dataloader.
+        Both datasets are built from the parquet files named in ``self.config.data``.
+        Also computes ``self.total_training_steps`` and writes it into the actor
+        (and, when a critic is used, the critic) optimizer config.
+        """
+        from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
+        # NOTE: RLHFDataset is imported twice at module level; the later import
+        # (absolute_zero_reasoner.utils.dataset.rl_dataset) is the one bound here.
+        self.train_dataset = RLHFDataset(parquet_files=self.config.data.train_files,
+                                         tokenizer=self.tokenizer,
+                                         prompt_key=self.config.data.prompt_key,
+                                         max_prompt_length=self.config.data.max_prompt_length,
+                                         filter_prompts=True,
+                                         return_raw_chat=self.config.data.get('return_raw_chat', False),
+                                         truncation='error',
+                                         extra_source_key="train")
+        # use sampler for better ckpt resume
+        if self.config.data.shuffle:
+            # Seeded generator (default seed 1) for the random sampler.
+            train_dataloader_generator = torch.Generator()
+            train_dataloader_generator.manual_seed(self.config.data.get('seed', 1))
+            sampler = RandomSampler(data_source=self.train_dataset, generator=train_dataloader_generator)
+        else:
+            sampler = SequentialSampler(data_source=self.train_dataset)
+        # collate_fn here is the module-level import from verl, not a constructor argument.
+        self.train_dataloader = DataLoader(dataset=self.train_dataset,
+                                           batch_size=self.config.data.train_batch_size,
+                                           drop_last=True,
+                                           collate_fn=collate_fn,
+                                           sampler=sampler)
+        # NOTE(review): the docstring mentions a different validation prompt length,
+        # but the same config.data.max_prompt_length is passed for both datasets.
+        self.val_dataset = RLHFDataset(parquet_files=self.config.data.val_files,
+                                       tokenizer=self.tokenizer,
+                                       prompt_key=self.config.data.prompt_key,
+                                       max_prompt_length=self.config.data.max_prompt_length,
+                                       filter_prompts=True,
+                                       return_raw_chat=self.config.data.get('return_raw_chat', False),
+                                       truncation='error',
+                                       extra_source_key="val")
+        # batch_size equals the dataset length, so validation runs as one shuffled batch.
+        self.val_dataloader = DataLoader(dataset=self.val_dataset,
+                                         batch_size=len(self.val_dataset),
+                                         shuffle=True,
+                                         drop_last=True,
+                                         collate_fn=collate_fn)
+        assert len(self.train_dataloader) >= 1
+        assert len(self.val_dataloader) >= 1
+        print(f'Size of train dataloader: {len(self.train_dataloader)}')
+        print(f'Size of val dataloader: {len(self.val_dataloader)}')
+        # inject total_training_steps to actor/critic optim_config. This is hacky.
+        total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
+        # An explicit trainer.total_training_steps overrides the epoch-derived value.
+        if self.config.trainer.total_training_steps is not None:
+            total_training_steps = self.config.trainer.total_training_steps
+        self.total_training_steps = total_training_steps
+        print(f'Total training steps: {self.total_training_steps}')
+        # Put the config in struct mode, then use open_dict to allow adding the new key.
+        OmegaConf.set_struct(self.config, True)
+        with open_dict(self.config):
+            self.config.actor_rollout_ref.actor.optim.total_training_steps = total_training_steps
+            # Only set critic total_training_steps if critic is actually used
+            if self.use_critic:
+                self.config.critic.optim.total_training_steps = total_training_steps
+    def _validate(self, do_sample: bool = False):
+        """
+        The validation loop of PPO.
+        The only difference is logging more metrics.
+        Args:
+            do_sample: Forwarded to generation via ``meta_info['do_sample']``.
+        Returns:
+            A dict with the mean reward per data source under
+            ``val/test_score/<data_source>`` plus the mean of every metric returned
+            by ``self.val_reward_fn``. Returns ``{}`` as soon as a batch is found
+            to use a model-based reward while the reward model is enabled.
+        """
+        from collections import defaultdict
+        reward_tensor_lst = []
+        data_source_lst = []
+        # Lists to collect samples for the table
+        sample_inputs = []
+        sample_outputs = []
+        sample_scores = []
+        all_eval_metrics = defaultdict(list)
+        for test_data in self.val_dataloader:
+            test_batch = DataProto.from_single_dict(test_data)
+            # we only do validation on rule-based rm
+            if self.config.reward_model.enable and test_batch[0].non_tensor_batch['reward_model']['style'] == 'model':
+                return {}
+            # Store original inputs
+            input_ids = test_batch.batch['input_ids']
+            input_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
+            sample_inputs.extend(input_texts)
+            # Move the generation inputs out of test_batch; optional keys only if present.
+            batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+            non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+            if "multi_modal_data" in test_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("multi_modal_data")
+            if "raw_prompt" in test_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("raw_prompt")
+            if "tools_kwargs" in test_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("tools_kwargs")
+            if "interaction_kwargs" in test_batch.non_tensor_batch:
+                non_tensor_batch_keys_to_pop.append("interaction_kwargs")
+            test_gen_batch = test_batch.pop(
+                batch_keys=batch_keys_to_pop,
+                non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+            )
+            test_gen_batch.meta_info = {
+                'eos_token_id': self.tokenizer.eos_token_id,
+                'pad_token_id': self.tokenizer.pad_token_id,
+                'recompute_log_prob': False,
+                'do_sample': do_sample,
+                'validate': True,
+            }
+            # pad to be divisible by dp_size
+            # NOTE(review): self.async_rollout_mode is not assigned anywhere in the
+            # visible part of this class; presumably set during worker init - confirm.
+            size_divisor = self.actor_rollout_wg.world_size if not self.async_rollout_mode else self.config.actor_rollout_ref.rollout.agent.num_workers
+            test_gen_batch_padded, pad_size = pad_dataproto_to_divisor(test_gen_batch, size_divisor)
+            if not self.async_rollout_mode:
+                test_output_gen_batch_padded = self.actor_rollout_wg.generate_sequences(test_gen_batch_padded)
+            else:
+                test_output_gen_batch_padded = self.async_rollout_manager.generate_sequences(test_gen_batch_padded)
+            # unpad
+            test_output_gen_batch = unpad_dataproto(test_output_gen_batch_padded, pad_size=pad_size)
+            print('validation generation end')
+            # Store generated outputs
+            output_ids = test_output_gen_batch.batch["responses"]
+            output_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
+            sample_outputs.extend(output_texts)
+            test_batch = test_batch.union(test_output_gen_batch)
+            # evaluate using reward_function
+            # val_reward_fn is expected to return (reward tensor, dict of metrics).
+            reward_tensor, eval_metrics = self.val_reward_fn(test_batch)
+            for k, v in eval_metrics.items():
+                all_eval_metrics[k].append(v)
+            # Store scores
+            # Sum over the last dimension to get one score per sample.
+            scores = reward_tensor.sum(-1).cpu().tolist()
+            sample_scores.extend(scores)
+            reward_tensor_lst.append(reward_tensor)
+            # Samples without a 'data_source' entry are grouped under 'unknown'.
+            data_source_lst.append(test_batch.non_tensor_batch.get('data_source', ['unknown'] * reward_tensor.shape[0]))
+        self._maybe_log_val_generations(inputs=sample_inputs, outputs=sample_outputs, scores=sample_scores)
+        reward_tensor = torch.cat(reward_tensor_lst, dim=0).sum(-1).cpu()  # (batch_size,)
+        data_sources = np.concatenate(data_source_lst, axis=0)
+        # evaluate test_score based on data source
+        data_source_reward = {}
+        for i in range(reward_tensor.shape[0]):
+            data_source = data_sources[i]
+            if data_source not in data_source_reward:
+                data_source_reward[data_source] = []
+            data_source_reward[data_source].append(reward_tensor[i].item())
+        metric_dict = {}
+        for data_source, rewards in data_source_reward.items():
+            metric_dict[f'val/test_score/{data_source}'] = np.mean(rewards)
+        # Each reward-function metric is averaged across validation batches.
+        for k, v in all_eval_metrics.items():
+            metric_dict[k] = np.mean(v)
+        # Optionally dump inputs/outputs/scores to a JSON file; the relative path
+        # is resolved against the current working directory.
+        if self.config.eval.get('save_generations', False):
+            import json
+            with open(f'{self.config.trainer.experiment_name}_generations_{self.global_steps}.json', 'w') as f:
+                json.dump({
+                    'inputs': sample_inputs,
+                    'outputs': sample_outputs,
+                    'scores': sample_scores
+                }, f)
+        return metric_dict
+    def fit(self):
+        """
+        The training loop of PPO.
+        The driver process only need to call the compute functions of the worker group through RPC to construct the PPO dataflow.
+        The light-weight advantage computation is done on the driver process.
+        The only difference is logging more metrics.
+        """
+        from absolute_zero_reasoner.utils.tracking import ReasonRLTracking
+        from absolute_zero_reasoner.utils.logging_utils.stdout import PrettyPrinter as pp
+        from omegaconf import OmegaConf
+        # Display training setup header
+        pp.section_header("Training Setup")
+        logger = ReasonRLTracking(
+            project_name=self.config.trainer.project_name,
+            experiment_name=self.config.trainer.experiment_name,
+            default_backend=self.config.trainer.logger,
+            config=OmegaConf.to_container(self.config, resolve=True),
+            tags=self.config.trainer.wandb_tags,
+            resume="must" if self.config.trainer.resume_mode == 'auto' and \
+                self.config.trainer.wandb_run_id is not None else False,  # Add resume flag
+            run_id=self.config.trainer.wandb_run_id \
+                if self.config.trainer.wandb_run_id is not None else None  # Pass existing run ID
+        )
+        pp.status("Config", f"Project: {self.config.trainer.project_name}, Experiment: {self.config.trainer.experiment_name}", "info")
+        pp.status("Algorithm", f"Using {self.config.algorithm.adv_estimator} advantage estimator", "info")
+        pp.status("Setup", f"Critic enabled: {self.use_critic}, Reference policy: {self.use_reference_policy}", "info")
+        self.global_steps = 0
+        # load checkpoint before doing anything
+        pp.status("Checkpoint", "Loading checkpoint if available...", "info")
+        self._load_checkpoint()
+        # base model chat template
+        if self.config.actor_rollout_ref.model.pretrained_tokenizer:
+            self.tokenizer.chat_template = "{%- for message in messages -%}{{- '\n' if not loop.first -}}{{- message['content'] -}}{%- endfor -%}"
+        # perform validation before training
+        # currently, we only support validation using the reward_function.
+        if self.val_reward_fn is not None and self.config.trainer.get('val_before_train', True) and self.global_steps == 0:
+            pp.section_header("Initial Validation")
+            pp.status("Validation", "Running initial validation...", "info")
+            val_metrics = self._validate(do_sample=self.config.eval.do_sample)
+            # Convert metrics to table format
+            metrics_table = []
+            for k, v in val_metrics.items():
+                metrics_table.append([k, f"{v:.4f}" if isinstance(v, float) else v])
+            pp.table(["Metric", "Value"], metrics_table, "Initial Validation Results")
+            logger.log(data=val_metrics, step=self.global_steps)
+            # save val metrics to model path
+            if self.config.eval.get('log_to_model_path', False):
+                import json
+                import os
+                with open(os.path.join(self.config.actor_rollout_ref.model.path, 'math_metrics.json'), 'w') as f:
+                    json.dump(val_metrics, f)
+            if self.config.trainer.get('val_only', False):
+                pp.status("Training", "Validation only mode, exiting", "success")
+                return
+        # we start from step 1
+        self.global_steps += 1
+        last_val_metrics = None
+        self.max_steps_duration = 0
+        pp.section_header("Starting Training")
+        pp.status("Training", f"Starting training for {self.config.trainer.total_epochs} epochs ({total_steps} steps)", "info")
+        for epoch in range(self.config.trainer.total_epochs):
+            pp.status("Epoch", f"Starting epoch {epoch+1}/{self.config.trainer.total_epochs}", "info")
+            for batch_idx, batch_dict in enumerate(self.train_dataloader):
+                do_profile = self.global_steps in self.config.trainer.profile_steps if self.config.trainer.profile_steps is not None else False
+                if do_profile:
+                    self.actor_rollout_wg.start_profile()
+                    if self.use_reference_policy:
+                        self.ref_policy_wg.start_profile()
+                    if self.use_critic:
+                        self.critic_wg.start_profile()
+                    if self.use_rm:
+                        self.rm_wg.start_profile()
+                metrics = {}
+                timing_raw = {}
+                batch: DataProto = DataProto.from_single_dict(batch_dict)
+                # pop those keys for generation
+                batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
+                non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
+                if "multi_modal_data" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("multi_modal_data")
+                if "raw_prompt" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("raw_prompt")
+                if "tools_kwargs" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("tools_kwargs")
+                if "interaction_kwargs" in batch.non_tensor_batch:
+                    non_tensor_batch_keys_to_pop.append("interaction_kwargs")
+                gen_batch = batch.pop(
+                    batch_keys=batch_keys_to_pop,
+                    non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
+                )
+                is_last_step = self.global_steps >= self.total_training_steps
+                with marked_timer("step", timing_raw):
+                    # generate a batch
+                    with marked_timer("gen", timing_raw, color="red"):
+                        pp.status("Step", f"Generating sequences for batch {batch_idx+1}", "info")
+                        gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
+                    if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
+                        with marked_timer("gen_max", timing_raw, color="purple"):
+                            gen_baseline_batch = deepcopy(gen_batch)
+                            gen_baseline_batch.meta_info["do_sample"] = False
+                            gen_baseline_output = self.actor_rollout_wg.generate_sequences(gen_baseline_batch)
+                            batch = batch.union(gen_baseline_output)
+                            reward_baseline_tensor, _ = self.reward_fn(batch)
+                            reward_baseline_tensor = reward_baseline_tensor.sum(dim=-1)
+                            batch.pop(batch_keys=list(gen_baseline_output.batch.keys()))
+                            batch.batch["reward_baselines"] = reward_baseline_tensor
+                            del gen_baseline_batch, gen_baseline_output
+                    pp.status("Processing", "Preparing batch with UUIDs", "info")
+                    batch.non_tensor_batch['uid'] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))],
+                                                             dtype=object)
+                    # repeat to align with repeated responses in rollout
+                    batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+                    batch = batch.union(gen_batch_output)
+                    batch.batch["response_mask"] = compute_response_mask(batch)
+                    pp.status("Processing", "Balancing batch across ranks", "info")
+                    # Balance the number of valid tokens across DP ranks.
+                    # NOTE: This usually changes the order of data in the `batch`,
+                    # which won't affect the advantage calculation (since it's based on uid),
+                    # but might affect the loss calculation (due to the change of mini-batching).
+                    # TODO: Decouple the DP balancing and mini-batching.
+                    if self.config.trainer.balance_batch:
+                        self._balance_batch(batch, metrics=metrics)
+                    # compute global_valid tokens
+                    batch.meta_info['global_token_num'] = torch.sum(batch.batch['attention_mask'], dim=-1).tolist()
+                    # recompute old_log_probs
+                    with marked_timer("old_log_prob", timing_raw, color="blue"):
+                        old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
+                        entropys = old_log_prob.batch["entropys"]
+                        response_masks = batch.batch["response_mask"]
+                        loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
+                        entropy_agg = core_algos.agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
+                        old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
+                        metrics.update(old_log_prob_metrics)
+                        old_log_prob.batch.pop("entropys")
+                        batch = batch.union(old_log_prob)
+                        if "rollout_log_probs" in batch.batch.keys():
+                            # TODO: we may want to add diff of probs too.
+                            rollout_old_log_probs = batch.batch["rollout_log_probs"]
+                            actor_old_log_probs = batch.batch["old_log_probs"]
+                            attention_mask = batch.batch["attention_mask"]
+                            responses = batch.batch["responses"]
+                            response_length = responses.size(1)
+                            response_mask = attention_mask[:, -response_length:]
+                            rollout_probs = torch.exp(rollout_old_log_probs)
+                            actor_probs = torch.exp(actor_old_log_probs)
+                            rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
+                            rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
+                            rollout_probs_diff_max = torch.max(rollout_probs_diff)
+                            rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
+                            rollout_probs_diff_std = torch.std(rollout_probs_diff)
+                            metrics.update(
+                                {
+                                    "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
+                                    "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
+                                    "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
+                                }
+                            )
+                    if self.use_reference_policy:
+                        # compute reference log_prob
+                        with marked_timer("ref", timing_raw, color="olive"):
+                            if not self.ref_in_actor:
+                                ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
+                            else:
+                                ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch)
+                            batch = batch.union(ref_log_prob)
+                    # compute values
+                    if self.use_critic:
+                        with marked_timer('values', timing_raw):
+                            pp.status("Computation", "Computing critic values", "info")
+                            values = self.critic_wg.compute_values(batch)
+                            batch = batch.union(values)
+                    with marked_timer('adv', timing_raw):
+                        # compute scores. Support both model and function-based.
+                        pp.status("Rewards", "Computing rewards", "info")
+                        if self.use_rm:
+                            # we first compute reward model score
+                            reward_tensor = self.rm_wg.compute_rm_score(batch)
+                            batch = batch.union(reward_tensor)
+                        # we combine with rule-based rm
+                        reward_tensor, train_metrics = self.reward_fn(batch)
+                        train_metrics = {k: np.mean(v) for k, v in train_metrics.items()}
+                        metrics.update(train_metrics)
+                        batch.batch['token_level_scores'] = reward_tensor
+                        # compute rewards. apply_kl_penalty if available
+                        if self.config.algorithm.use_kl_in_reward:
+                            pp.status("KL Penalty", "Applying KL penalty", "info")
+                            batch, kl_metrics = apply_kl_penalty(batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty)
+                            metrics.update(kl_metrics)
+                        else:
+                            batch.batch['token_level_rewards'] = batch.batch['token_level_scores']
+                        # compute advantages, executed on the driver process
+                        pp.status("Advantage", f"Computing {self.config.algorithm.adv_estimator} advantage", "info")
+                        batch = compute_advantage(batch,
+                                                  adv_estimator=self.config.algorithm.adv_estimator,
+                                                  gamma=self.config.algorithm.gamma,
+                                                  lam=self.config.algorithm.lam,
+                                                  num_repeat=self.config.actor_rollout_ref.rollout.n)
+                    # update critic
+                    if self.use_critic:
+                        with marked_timer('update_critic', timing_raw):
+                            pp.status("Update", "Updating critic network", "info")
+                            critic_output = self.critic_wg.update_critic(batch)
+                        critic_output_metrics = reduce_metrics(critic_output.meta_info['metrics'])
+                        metrics.update(critic_output_metrics)
+                    # implement critic warmup
+                    if self.config.trainer.critic_warmup <= self.global_steps:
+                        # update actor
+                        with marked_timer('update_actor', timing_raw):
+                            batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
+                            pp.status("Update", "Updating actor network", "info")
+                            actor_output = self.actor_rollout_wg.update_actor(batch)
+                        actor_output_metrics = reduce_metrics(actor_output.meta_info['metrics'])
+                        metrics.update(actor_output_metrics)
+                    # Log rollout generations if enabled
+                    rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
+                    if rollout_data_dir:
+                        with marked_timer("dump_rollout_generations", timing_raw, color="green"):
+                            print(batch.batch.keys())
+                            inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
+                            outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
+                            scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
+                            self._dump_generations(
+                                inputs=inputs,
+                                outputs=outputs,
+                                scores=scores,
+                                reward_extra_infos_dict=train_metrics,
+                                dump_path=rollout_data_dir,
+                            )
+                    # validate
+                    if self.val_reward_fn is not None and self.config.trainer.test_freq > 0 and \
+                        self.global_steps % self.config.trainer.test_freq == 0:
+                        with marked_timer('testing', timing_raw):
+                            pp.section_header(f"Validation (Step {self.global_steps})")
+                            pp.status("Validation", "Running validation", "info")
+                            val_metrics: dict = self._validate()
+                            if is_last_step:
+                                last_val_metrics = val_metrics
+                            # Convert metrics to table format
+                            val_metrics_table = []
+                            for k, v in val_metrics.items():
+                                val_metrics_table.append([k, f"{v:.4f}" if isinstance(v, float) else v])
+                            pp.table(["Metric", "Value"], val_metrics_table, f"Validation Results (Step {self.global_steps})")
+                        metrics.update(val_metrics)
+                    if self.config.trainer.save_freq > 0 and \
+                            self.global_steps % self.config.trainer.save_freq == 0:
+                        with marked_timer('save_checkpoint', timing_raw):
+                            pp.status("Checkpoint", f"Saving checkpoint at step {self.global_steps}", "success")
+                            self._save_checkpoint()
+                steps_duration = timing_raw["step"]
+                self.max_steps_duration = max(self.max_steps_duration, steps_duration)
+                # training metrics
+                metrics.update(
+                    {
+                        "training/global_step": self.global_steps,
+                        "training/epoch": epoch,
+                    }
+                )
+                # collect metrics
+                metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
+                metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
+                # Display key metrics in a table
+                key_metrics = {k: v for k, v in metrics.items()}
+                if key_metrics:
+                    metrics_table = []
+                    for k, v in key_metrics.items():
+                        metrics_table.append([k, f"{v:.4f}" if isinstance(v, float) else v])
+                    pp.table(["Metric", "Value"], metrics_table, f"Step {self.global_steps} Results")
+                # Display timing info
+                timing_metrics = {k: v for k, v in metrics.items() if 'time' in k}
+                if timing_metrics:
+                    timing_table = []
+                    for k, v in timing_metrics.items():
+                        timing_table.append([k, f"{v:.4f}s" if isinstance(v, float) else v])
+                    pp.table(["Operation", "Time"], timing_table, "Timing Information")
+                logger.log(data=metrics, step=self.global_steps)
+                # Show progress within epoch
+                pp.progress_bar(self.global_steps, total_steps, f"Training Progress (Epoch {epoch+1})")
+                self.global_steps += 1
+                if self.global_steps >= self.total_training_steps:
+                    pp.section_header("Training Complete")
+                    # perform validation after training
+                    if self.val_reward_fn is not None:
+                        pp.status("Validation", "Running final validation", "info")
+                        val_metrics = self._validate()
+                        # Convert metrics to table format
+                        final_metrics_table = []
+                        for k, v in val_metrics.items():
+                            final_metrics_table.append([k, f"{v:.4f}" if isinstance(v, float) else v])
+                        pp.table(["Metric", "Value"], final_metrics_table, "Final Validation Results")
+                        logger.log(data=val_metrics, step=self.global_steps)
+                    if self.config.trainer.save_freq > 0 and \
+                            (self.global_steps - 1) % self.config.trainer.save_freq != 0:
+                        with marked_timer('save_checkpoint', timing_raw):
+                            pp.status("Checkpoint", "Saving final checkpoint", "success")
+                            self._save_checkpoint()
+                    pp.status("Training", "Training completed successfully!", "success")
+                    if do_profile:
+                        self.actor_rollout_wg.stop_profile()
+                        if self.use_reference_policy:
+                            self.ref_policy_wg.stop_profile()
+                        if self.use_critic:
+                            self.critic_wg.stop_profile()
+                        if self.use_rm:
+                            self.rm_wg.stop_profile()
+                    return

absolute_zero_reasoner/trainer/ppo/ttrlvr_azr_integration.py ADDED Viewed

	@@ -0,0 +1,125 @@

+"""
+TTRLVR + AZR Integration Module
+TTRLVR의 데이터로부터 AZR의 학습에 필요한 형태로 변환하고,
+TTRLVR의 reward 계산 로직을 통합
+"""
+import os
+import pandas as pd
+from typing import Dict, List, Any, Optional
+import torch
+from transformers import AutoTokenizer
+from ...rewards.ttrlvr_reward_manager import TTRLVRRewardManager
+class TTRLVRAZRDataProcessor:
+    """TTRLVR 데이터를 AZR 학습에 맞게 처리하는 클래스"""
+    def __init__(self, tokenizer: AutoTokenizer):
+        self.tokenizer = tokenizer
+        self.reward_manager = TTRLVRRewardManager(
+            tokenizer=tokenizer,
+            num_examine=0,
+            reward_fn_extraction_type='rule',
+            math_metric='accuracy',
+            split='test',
+            splitter='boxed',
+            output_path='./ttrlvr_output',
+            max_prompt_length=2048,
+            generation_reward_config=type('obj', (object,), {
+                'use_original_code_as_ref': False,
+                'reward_type': 'code_execution',
+                'weight': 1.0
+            })
+        )
+    def load_ttrlvr_data(self, data_path: str) -> Dict[str, pd.DataFrame]:
+        """TTRLVR parquet 파일들을 로드"""
+        data_by_type = {}
+        for task_type in ['induction', 'deduction', 'abduction']:
+            file_path = os.path.join(data_path, f"{task_type}.parquet")
+            if os.path.exists(file_path):
+                df = pd.read_parquet(file_path)
+                data_by_type[task_type] = df
+        return data_by_type
+    def prepare_batch_for_azr(self, batch_data: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        TTRLVR 배치 데이터를 AZR 형식으로 변환
+        Returns:
+            prompts: 프롬프트 리스트
+            metadata: 각 샘플의 메타데이터 (task_type, evaluation_data 등)
+        """
+        prompts = []
+        metadata = []
+        for data in batch_data:
+            # prompt 추출 (TTRLVR은 conversation 형식으로 저장)
+            if isinstance(data['prompt'], list) and len(data['prompt']) > 0:
+                prompt_text = data['prompt'][0].get('content', '')
+            else:
+                prompt_text = str(data['prompt'])
+            prompts.append(prompt_text)
+            # 메타데이터 구성
+            meta = {
+                'task_type': self._extract_task_type_from_uid(data.get('uid', '')),
+                'expected_solution': data.get('ground_truth', ''),
+                'problem': data.get('problem', {}),
+                'ipo_group_id': data.get('ipo_group_id', ''),
+                'uid': data.get('uid', '')
+            }
+            # evaluation_data 구성 (task 타입별)
+            if meta['task_type'] == 'induction':
+                # IPO에서 input/output 쌍 추출
+                meta['evaluation_data'] = {
+                    'input_output_pairs': [
+                        (meta['problem'].get('input', ''),
+                         meta['problem'].get('output', ''))
+                    ]
+                }
+            elif meta['task_type'] == 'deduction':
+                meta['evaluation_data'] = {
+                    'function_code': meta['problem'].get('snippet', ''),
+                    'input': meta['problem'].get('input', '')
+                }
+            elif meta['task_type'] == 'abduction':
+                meta['evaluation_data'] = {
+                    'function_code': meta['problem'].get('snippet', ''),
+                    'expected_output': meta['problem'].get('output', '')
+                }
+            metadata.append(meta)
+        return {
+            'prompts': prompts,
+            'metadata': metadata
+        }
+    def compute_rewards_for_responses(self,
+                                    prompts: List[str],
+                                    responses: List[str],
+                                    metadata: List[Dict[str, Any]]) -> List[float]:
+        """
+        모델 응답에 대한 reward 계산
+        complete_pipeline.py의 _compute_rewards_with_azr과 동일한 로직 사용
+        """
+        return self.reward_manager.compute_rewards(prompts, responses, metadata)
+    def _extract_task_type_from_uid(self, uid: str) -> str:
+        """UID에서 task 타입 추출"""
+        if 'induction' in uid:
+            return 'induction'
+        elif 'deduction' in uid:
+            return 'deduction'
+        elif 'abduction' in uid:
+            return 'abduction'
+        else:
+            return 'unknown'

absolute_zero_reasoner/utils/__init__.py ADDED Viewed

File without changes

absolute_zero_reasoner/utils/auxiliary.py ADDED Viewed

	@@ -0,0 +1,11 @@

+reflection_keywords = [
+    "wait", "recheck", "retry", "rethink", "re-verify", "re-evaluate",
+    "check again", "try again", "think again", "verify again",
+    "evaluate again", "let's correct", "however", "alternatively",
+    "reconsider", "review", "revisit", "double-check", "cross-check",
+    "second look", "reassess", "inspect", "examine again", "re-examine",
+    "revise", "adjust", "modify", "recalibrate", "pause", "reflect",
+    "clarify", "confirm", "validate again", "on second thought",
+    "in retrospect", "upon reflection", "alternately", "perhaps",
+    "maybe", "on the other hand"
+]

absolute_zero_reasoner/utils/code_utils/__init__.py ADDED Viewed

File without changes

absolute_zero_reasoner/utils/code_utils/checks.py ADDED Viewed

	@@ -0,0 +1,182 @@

+import hashlib
+import ast
+import re
+from typing import List
+def check_determinism(code: str, inputs: str, executor, prev_output: str = None, n_runs: int = 1):
+    """expects an executor that outputs string output and status"""
+    all_outputs = set()
+    if prev_output is not None:
+        hash = hashlib.md5(str(prev_output).encode()).hexdigest()
+        all_outputs.add(hash)
+    for _ in range(n_runs):
+        result = executor.run_code(code, inputs)[0]
+        hash = hashlib.md5(str(result).encode()).hexdigest()
+        all_outputs.add(hash)
+    return len(all_outputs) == 1
+def contains_banned_imports(code: str, banned_keywords: List[str], banned_keywords_for_errors_and_exceptions: List[str] = []) -> bool:
+    """Check if code imports any banned modules using AST parsing."""
+    try:
+        tree = ast.parse(code)
+        for node in ast.walk(tree):
+            if isinstance(node, ast.Import):
+                for alias in node.names:
+                    if any(banned in alias.name.split('.') for banned in banned_keywords):
+                        return True
+            elif isinstance(node, ast.ImportFrom):
+                module = node.module.split('.') if node.module else []
+                if any(banned in module for banned in banned_keywords):
+                    return True
+                for alias in node.names:
+                    if any(banned in alias.name.split('.') for banned in banned_keywords):
+                        return True
+            if banned_keywords_for_errors_and_exceptions:
+                # Check for assert statements
+                if isinstance(node, ast.Assert) and 'assert' in banned_keywords_for_errors_and_exceptions:
+                    return True
+                # Check for raise statements
+                elif isinstance(node, ast.Raise) and 'raise' in banned_keywords_for_errors_and_exceptions:
+                    return True
+                # Check for try-except blocks
+                elif isinstance(node, ast.Try) and 'try' in banned_keywords_for_errors_and_exceptions:
+                    return True
+                # Check for except handlers
+                elif isinstance(node, ast.ExceptHandler) and 'except' in banned_keywords_for_errors_and_exceptions:
+                    return True
+        return False
+    except SyntaxError:
+        # Fallback to simple check if AST parsing fails
+        return any(re.search(rf'\b{re.escape(banned)}\b', code) for banned in banned_keywords)
+def check_no_definitions(code: str, composite_functions: List[str]) -> bool:
+    try:
+        tree = ast.parse(code)
+    except SyntaxError:
+        return False
+    for node in tree.body:
+        if isinstance(node, ast.FunctionDef) and node.name in composite_functions:
+            return False
+    return True
+def check_composite_function(code: str, composite_functions: List[str]) -> bool:
+    composite_function_names = [f"g_{i}" for i in range(len(composite_functions))]
+    try:
+        tree = ast.parse(code)
+    except SyntaxError:
+        return False
+    f_def = None
+    for node in tree.body:
+        if isinstance(node, ast.FunctionDef) and node.name == 'f':
+            f_def = node
+            break
+    if f_def is None:
+        return False
+    parameters = {arg.arg for arg in f_def.args.args}
+    assigned_vars_visitor = AssignedVarsVisitor()
+    for stmt in f_def.body:
+        assigned_vars_visitor.visit(stmt)
+    scope_vars = parameters | assigned_vars_visitor.assigned
+    call_checker = CallChecker(composite_function_names, scope_vars)
+    for stmt in f_def.body:
+        call_checker.visit(stmt)
+    result = call_checker.called == set(composite_function_names) and call_checker.valid
+    return result
+class AssignedVarsVisitor(ast.NodeVisitor):
+    def __init__(self):
+        self.assigned = set()
+    def visit_Assign(self, node):
+        for target in node.targets:
+            self.collect_names(target)
+        self.generic_visit(node)
+    def collect_names(self, node):
+        if isinstance(node, ast.Name):
+            self.assigned.add(node.id)
+        elif isinstance(node, (ast.Tuple, ast.List)):
+            for elt in node.elts:
+                self.collect_names(elt)
+class CallChecker(ast.NodeVisitor):
+    """Record calls to the composite functions and check the names passed to them.
+    After visiting, ``called`` holds the composite-function names that were invoked
+    and ``valid`` is False if any such call had a positional argument that
+    references a name outside the known scope.
+    """
+    def __init__(self, composite_functions, scope_vars):
+        # Names of the functions whose calls are tracked.
+        self.composite_functions = composite_functions
+        # Names treated as in scope everywhere (supplied by the caller).
+        self.scope_vars = scope_vars
+        self.called = set()
+        self.valid = True
+        # Stack of additional scopes (name -> mapped variable or None), pushed while
+        # visiting nested function definitions and list comprehensions.
+        self.local_scopes = [{}]
+    def visit_FunctionDef(self, node):
+        """Visit a nested function with its positional parameters pushed as a scope."""
+        self.local_scopes.append({arg.arg: None for arg in node.args.args})
+        self.generic_visit(node)
+        self.local_scopes.pop()
+    def visit_ListComp(self, node):
+        """Visit a list comprehension's element and conditions under a temporary scope."""
+        comp_scope = {}
+        for gen in node.generators:
+            # Loop targets only count as in scope when the iterable is a bare
+            # name that is itself in ``scope_vars``.
+            if isinstance(gen.iter, ast.Name) and gen.iter.id in self.scope_vars:
+                self.collect_names(gen.target, comp_scope)
+        self.local_scopes.append(comp_scope)
+        # Only the element and the ``if`` clauses are visited; the iterables are not.
+        self.visit(node.elt)
+        for gen in node.generators:
+            for comp_if in gen.ifs:
+                self.visit(comp_if)
+        self.local_scopes.pop()
+    def visit_Call(self, node):
+        """Track composite-function calls and validate their positional arguments."""
+        if isinstance(node.func, ast.Name):
+            if node.func.id in self.composite_functions:
+                func_name = node.func.id
+                self.called.add(func_name)
+                current_scope = self.build_current_scope()
+                # Keyword arguments are not inspected here, only ``node.args``.
+                for arg in node.args:
+                    names = self.get_names(arg)
+                    if not all(name in current_scope for name in names):
+                        self.valid = False
+            # NOTE(review): this set is built by walking the Call node itself. A
+            # Call's sub-tree consists of expressions, so it looks like no
+            # FunctionDef can ever be found and this branch never runs — confirm
+            # intent (perhaps the enclosing module/function was meant).
+            elif node.func.id in {n.name for n in ast.walk(node) if isinstance(n, ast.FunctionDef)}:
+                for parent in ast.walk(node):
+                    if isinstance(parent, ast.FunctionDef) and parent.name == node.func.id:
+                        for param, arg in zip(parent.args.args, node.args):
+                            if isinstance(arg, ast.Name):
+                                self.local_scopes[-1][param.arg] = arg.id
+        self.generic_visit(node)
+    def build_current_scope(self):
+        """Return the set of names visible now: ``scope_vars`` plus all stacked scopes."""
+        scope = set(self.scope_vars)
+        for local_scope in self.local_scopes:
+            scope.update(local_scope.keys())
+            # Mapped variables (non-None values) are also counted as visible.
+            for mapped_var in local_scope.values():
+                if mapped_var:
+                    scope.add(mapped_var)
+        return scope
+    def collect_names(self, node, scope_dict):
+        """Add the names in a (possibly nested tuple/list) target to ``scope_dict`` with value None."""
+        if isinstance(node, ast.Name):
+            scope_dict[node.id] = None
+        elif isinstance(node, (ast.Tuple, ast.List)):
+            for elt in node.elts:
+                self.collect_names(elt, scope_dict)
+    def get_names(self, node):
+        """Return ids of names read (Load context) inside ``node``, excluding composite functions."""
+        return [n.id for n in ast.walk(node) if isinstance(n, ast.Name)
+                and isinstance(n.ctx, ast.Load)
+                and n.id not in self.composite_functions]

absolute_zero_reasoner/utils/code_utils/parsers.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import ast
+import re
+from typing import List
+def parse_imports(code_snippet: str) -> List[str]:
+    imports = []
+    try:
+        tree = ast.parse(code_snippet)
+        for node in ast.walk(tree):
+            if isinstance(node, (ast.Import, ast.ImportFrom)):
+                # Reconstruct import line from AST node
+                if isinstance(node, ast.Import):
+                    import_line = "import " + ", ".join(
+                        [alias.name + (f" as {alias.asname}" if alias.asname else "")
+                            for alias in node.names]
+                    )
+                else:
+                    module = node.module or ""
+                    import_line = f"from {module} import " + ", ".join(
+                        [alias.name + (f" as {alias.asname}" if alias.asname else "")
+                            for alias in node.names]
+                    )
+                    if node.level > 0:
+                        import_line = f"from {'.' * node.level}{module} import " + ", ".join(
+                            [alias.name + (f" as {alias.asname}" if alias.asname else "")
+                                for alias in node.names]
+                        )
+                imports.append(import_line)
+    except Exception as e:
+        import_pattern = r"^\s*(?:from|import)\s+.*$"
+        imports = [i.strip() for i in re.findall(import_pattern, code_snippet, re.MULTILINE)]
+    return imports
+def parse_error(error_message: str) -> str:
+    # split by colon
+    error_message = error_message.split(':')[0]
+    return error_message.strip()
+def replace_main_function_name(code: str, old_name: str, new_name: str) -> str:
+    """
+    Replace all occurrences of `old_name` with `new_name` in the code.
+    Replace the definition and all recursive calls of `old_name` with `new_name`.
+    """
+    tree = ast.parse(code)
+    for node in ast.walk(tree):
+        if isinstance(node, ast.FunctionDef) and node.name == old_name:
+            node.name = new_name
+        elif isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == old_name:
+            node.func.id = new_name
+    return ast.unparse(tree)
+def remove_comments_and_docstrings(code: str) -> str:
+    """
+    Remove all comments and docstrings from the code.
+    """
+    try:
+        tree = ast.parse(code)
+        for node in ast.walk(tree):
+            if isinstance(node, (ast.AsyncFunctionDef, ast.FunctionDef, ast.ClassDef, ast.Module)):
+                # Remove all leading docstrings
+                while node.body and isinstance(node.body[0], ast.Expr):
+                    expr = node.body[0].value
+                    if isinstance(expr, (ast.Str, ast.Constant)) and (
+                        isinstance(expr.value, str) if isinstance(expr, ast.Constant) else True
+                    ):
+                        node.body.pop(0)
+                    else:
+                        break
+        # Convert back to code - AST unparse already removes comments
+        code_without_docstrings = ast.unparse(tree)
+        # Only remove empty lines and trim whitespace
+        lines = [
+            line.rstrip()
+            for line in code_without_docstrings.split('\n')
+            if line.strip()
+        ]
+        return '\n'.join(lines)
+    except Exception as e:
+        return code  # Return original code if parsing fails
+def remove_any_not_definition_imports(code: str) -> str:
+    """
+    Strip every top-level statement that is neither an import nor a definition.
+    Kept at module level:
+    - `import ...` and `from ... import ...`
+    - class definitions
+    - function and async function definitions
+    Dropped at module level:
+    - assignments and constant declarations
+    - standalone expressions and any other statement
+    Statements nested inside a kept definition are not touched. Blank lines
+    are removed from the regenerated code. If the code cannot be processed
+    (for example it does not parse), it is returned unchanged.
+    """
+    kept_kinds = (
+        ast.Import,
+        ast.ImportFrom,
+        ast.FunctionDef,
+        ast.AsyncFunctionDef,
+        ast.ClassDef,
+    )
+    try:
+        module = ast.parse(code)
+        # Only the module's direct children are filtered.
+        module.body = [stmt for stmt in module.body if isinstance(stmt, kept_kinds)]
+        ast.fix_missing_locations(module)
+        rendered = ast.unparse(module)
+        non_blank = [row for row in rendered.split('\n') if row.strip()]
+        return '\n'.join(non_blank)
+    except Exception:
+        return code
+class PrintRemover(ast.NodeTransformer):
+    """AST transformer that removes calls to the bare name `print`.
+    - A statement that is just `print(...)` is deleted.
+    - A `print(...)` used as a value (argument, assignment, ...) is replaced
+      by the constant `None`, which is what `print` returns. Its arguments
+      are dropped, so side effects inside them are lost.
+    - Any block emptied by the removal gets a `pass` so the tree still
+      unparses to valid Python.
+    Only calls whose callee is the plain name `print` are matched; attribute
+    calls such as `builtins.print(...)` are left alone.
+    """
+    @staticmethod
+    def _is_print_call(node) -> bool:
+        """Return True if `node` is a call to the bare name `print`."""
+        return (
+            isinstance(node, ast.Call)
+            and isinstance(node.func, ast.Name)
+            and node.func.id == 'print'
+        )
+    def visit_Expr(self, node):
+        # Handle print statements standing on their own line
+        if self._is_print_call(node.value):
+            return None
+        # Recurse so prints nested in the expression, e.g. foo(print(1)), are handled.
+        self.generic_visit(node)
+        return node
+    def visit_Call(self, node):
+        # Handle print calls in other contexts (like assignments)
+        if self._is_print_call(node):
+            return ast.Constant(value=None)
+        # Recurse into the arguments of every other call.
+        self.generic_visit(node)
+        return node
+    def _handle_block(self, node):
+        """Visit children, then make sure `node.body` is not left empty."""
+        self.generic_visit(node)
+        if not node.body:
+            node.body.append(ast.Pass())
+        return node
+    def visit_For(self, node):
+        return self._handle_block(node)
+    def visit_AsyncFor(self, node):
+        return self._handle_block(node)
+    def visit_While(self, node):
+        return self._handle_block(node)
+    def visit_FunctionDef(self, node):
+        return self._handle_block(node)
+    def visit_AsyncFunctionDef(self, node):
+        return self._handle_block(node)
+    def visit_ClassDef(self, node):
+        return self._handle_block(node)
+    def visit_If(self, node):
+        return self._handle_block(node)
+    def visit_With(self, node):
+        return self._handle_block(node)
+    def visit_AsyncWith(self, node):
+        return self._handle_block(node)
+    def visit_match_case(self, node):
+        return self._handle_block(node)
+    def visit_Try(self, node):
+        # Remember which optional clauses exist before their statements are
+        # removed: an emptied `finally` must keep a `pass`, otherwise a
+        # `try` without handlers becomes invalid.
+        had_else = bool(node.orelse)
+        had_finally = bool(node.finalbody)
+        self.generic_visit(node)
+        # Handle main try body
+        if not node.body:
+            node.body.append(ast.Pass())
+        # Handle except handlers
+        for handler in node.handlers:
+            if not handler.body:
+                handler.body.append(ast.Pass())
+        # Handle else clause
+        if had_else and not node.orelse:
+            node.orelse.append(ast.Pass())
+        # Handle finally clause
+        if had_finally and not node.finalbody:
+            node.finalbody.append(ast.Pass())
+        return node
+    # `try ... except* ...` nodes have the same fields as Try.
+    visit_TryStar = visit_Try
+def remove_print_statements(code: str) -> str:
+    """
+    Return `code` regenerated from its AST after `PrintRemover` has been applied.
+    Because the result comes from `ast.unparse`, comments and the original
+    formatting are not preserved. Raises SyntaxError if `code` does not parse.
+    """
+    parsed = ast.parse(code)
+    without_prints = PrintRemover().visit(parsed)
+    # Nodes created by the transformer carry no line/column info yet.
+    ast.fix_missing_locations(without_prints)
+    return ast.unparse(without_prints)
+if __name__ == "__main__":
+    # Manual smoke check: print what parse_error extracts from sample messages.
+    for sample_message in (
+        "NameError: name 'x' is not defined",
+        "TypeError: unsupported operand type(s) for -: 'str' and 'str'",
+        "ValueError: invalid literal for int() with base 10: 'x'",
+    ):
+        print(parse_error(sample_message))

absolute_zero_reasoner/utils/code_utils/python_executor.py ADDED Viewed

	@@ -0,0 +1,435 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# https://github.com/QwenLM/QwQ/blob/main/eval/eval/math_opensource_utils/python_executor.py
+import copy
+import datetime
+import io
+import logging
+import pickle
+import traceback
+from concurrent.futures import TimeoutError
+from contextlib import redirect_stdout
+from functools import partial
+from typing import Any, Dict, Optional, List, Tuple
+import ast
+import time
+import numpy as np
+import dateutil.relativedelta
+import regex
+from pebble import ProcessPool
+from timeout_decorator import timeout
+from tqdm import tqdm
+from absolute_zero_reasoner.utils.code_utils.templates import (
+    RUN_CODE_TEMPLATE,
+    EVAL_INPUT_PREDICTION_TEMPLATE,
+    EVAL_OUTPUT_PREDICTION_TEMPLATE,
+    VALIDATE_CODE_TEMPLATE,
+    CHECK_DETERMINISM_TEMPLATE,
+    EVAL_K_INPUT_PREDICTION_TEMPLATE,
+    EVAL_K_OUTPUT_PREDICTION_TEMPLATE,
+)
+from absolute_zero_reasoner.utils.code_utils.checks import contains_banned_imports
+from absolute_zero_reasoner.utils.code_utils.parsers import parse_error
+class GenericRuntime:
+    """Namespace in which code strings are run with `exec` / `eval`.
+    Subclasses configure it through class attributes:
+    - GLOBAL_DICT: initial globals, shallow-copied for each instance.
+    - LOCAL_DICT: shallow-copied into `_local_vars` when truthy (this class
+      never reads `_local_vars` afterwards).
+    - HEADERS: code pieces executed once at construction.
+    """
+    GLOBAL_DICT = {}
+    LOCAL_DICT = None
+    HEADERS = []
+    def __init__(self):
+        self._global_vars = copy.copy(self.GLOBAL_DICT)
+        self._local_vars = None
+        if self.LOCAL_DICT:
+            self._local_vars = copy.copy(self.LOCAL_DICT)
+        for header in self.HEADERS:
+            self.exec_code(header)
+    def exec_code(self, code_piece: str) -> None:
+        """Execute `code_piece` in this runtime's globals.
+        Raises RuntimeError (without a message) if the text contains
+        `input(`; the whitespace/start prefix in the pattern is optional,
+        so any occurrence matches. This is not a sandbox: the code runs
+        with plain `exec`.
+        """
+        asks_for_input = regex.search(r'(\s|^)?input\(', code_piece) is not None
+        if asks_for_input:
+            raise RuntimeError()
+        # TODO: run in a real sandbox, e.g. https://github.com/shroominic/codebox-api
+        exec(code_piece, self._global_vars)
+    def eval_code(self, expr: str) -> Any:
+        """Evaluate `expr` in this runtime's globals and return its value."""
+        value = eval(expr, self._global_vars)
+        return value
+    def inject(self, var_dict: Dict[str, Any]) -> None:
+        """Copy every key/value pair of `var_dict` into the globals."""
+        self._global_vars.update(var_dict.items())
+    @property
+    def answer(self):
+        """The global variable named `answer`; KeyError if it was never set."""
+        return self._global_vars['answer']
+class DateRuntime(GenericRuntime):
+    """GenericRuntime whose globals start out with date helpers.
+    `datetime` is the `datetime.datetime` class; both `timedelta` and
+    `relativedelta` are bound to dateutil's `relativedelta` class.
+    """
+    GLOBAL_DICT = dict(
+        datetime=datetime.datetime,
+        timedelta=dateutil.relativedelta.relativedelta,
+        relativedelta=dateutil.relativedelta.relativedelta,
+    )
+class CustomDict(dict):
+    """dict whose iteration walks a snapshot of the keys taken up front."""
+    def __iter__(self):
+        # Materialise the keys first, then iterate over that list.
+        key_snapshot = list(super().__iter__())
+        return iter(key_snapshot)
+class ColorObjectRuntime(GenericRuntime):
+    """GenericRuntime in which the global name `dict` refers to CustomDict."""
+    GLOBAL_DICT = dict(dict=CustomDict)
+class PythonExecutor:
+    """Run generated Python snippets in a pebble ProcessPool and score them.
+    Snippets are built from the templates imported at the top of this module
+    and executed through `batch_apply`, which yields `(result, report)` string
+    pairs. `report` is "Done" on success; otherwise it is the last traceback
+    line or a "Timeout Error" / "Error: ..." / "Critical Error: ..." marker.
+    SECURITY: snippets run with plain `exec` in worker processes, and several
+    methods `eval` text that comes from executed code or from predictions in
+    the calling process. Use only inside an isolated environment.
+    """
+    def __init__(
+        self,
+        runtime: Optional[Any] = None,
+        get_answer_symbol: Optional[str] = None,
+        get_answer_expr: Optional[str] = None,
+        get_answer_from_stdout: bool = False,
+        timeout_length: int = 10,
+        ast_check: bool = False,
+        max_workers: int = 1,
+    ) -> None:
+        """Store the configuration; the process pool is created lazily.
+        Args:
+            runtime: object providing exec_code/eval_code; a GenericRuntime
+                is created when omitted.
+            get_answer_symbol, get_answer_expr, get_answer_from_stdout:
+                forwarded to `execute`. `batch_apply` always runs with
+                auto_mode=True, where `execute` does not consult them.
+            timeout_length: seconds allowed per exec/eval call and per pool task.
+            ast_check: if True, snippets are parsed with `ast` before running.
+            max_workers: upper bound on pool workers.
+        """
+        self.runtime = runtime if runtime else GenericRuntime()
+        self.answer_symbol = get_answer_symbol
+        self.answer_expr = get_answer_expr
+        self.get_answer_from_stdout = get_answer_from_stdout
+        self.timeout_length = timeout_length
+        self.ast_check = ast_check
+        self.max_workers = max_workers
+        self._process_pool = None
+    def __del__(self):
+        try:
+            self.cleanup()
+        except Exception as e:
+            print(f"Error terminating pool: {e}")
+    def cleanup(self):
+        """Explicitly clean up the process pool"""
+        # getattr: __del__ may run on an instance whose __init__ did not finish.
+        pool = getattr(self, '_process_pool', None)
+        if pool is not None:
+            pool.close()
+            pool.join()
+            self._process_pool = None
+    def _get_process_pool(self, size_hint):
+        """Get or create a ProcessPool with appropriate size"""
+        # NOTE: the pool is sized on first use only; later, larger batches
+        # reuse it even when max_workers would allow more workers.
+        if self._process_pool is None:
+            workers = max(1, min(size_hint, self.max_workers))
+            self._process_pool = ProcessPool(max_workers=workers)
+        return self._process_pool
+    def process_generation_to_code(self, gens: List[str]) -> List[List[str]]:
+        """Strip each generation and split it into its lines."""
+        return [g.strip().split('\n') for g in gens]
+    @staticmethod
+    def _prepend_imports(code: str, imports) -> str:
+        """Return `code` with the import lines (list or numpy array) in front."""
+        if isinstance(imports, np.ndarray):
+            # Truth-testing a multi-element array raises, so convert first.
+            imports = imports.tolist()
+        if imports:
+            code = '\n'.join(imports) + '\n' + code
+        return code
+    @staticmethod
+    def _is_parseable(source: str) -> bool:
+        """Return True if `source` parses as Python."""
+        try:
+            ast.parse(source)
+        except Exception:
+            # Covers SyntaxError, ValueError, RecursionError, ... but, unlike
+            # a bare except, lets KeyboardInterrupt/SystemExit propagate.
+            return False
+        return True
+    @staticmethod
+    def _scatter_scores(scores, valid_positions, total) -> List[float]:
+        """Place `scores` at `valid_positions` in a list of `total` zeros.
+        Keeps every score aligned with the candidate it belongs to; skipped
+        (invalid) candidates keep 0.0. Raises AssertionError when the number
+        of scores differs from the number of executed candidates.
+        """
+        assert len(scores) == len(valid_positions)
+        merged = [0.0] * total
+        for position, score in zip(valid_positions, scores):
+            merged[position] = score
+        return merged
+    def _score_snippet(self, code_snippet: str, caller: str) -> float:
+        """Run a scoring snippet; 1.0 if it ran cleanly and printed a truthy value.
+        Retries up to three times when running or evaluating raises, and
+        returns 0.0 (never None) once the retries are exhausted.
+        """
+        max_retries = 3
+        for retry in range(max_retries):
+            try:
+                correct, status = self.apply(code_snippet)
+                # SECURITY: `correct` is text produced by the executed snippet
+                # and is eval'd here, in the calling process.
+                return 0.0 if 'error' in status.lower() or not eval(correct) else 1.0
+            except Exception as e:
+                if retry == max_retries - 1:
+                    error_details = traceback.format_exc()
+                    print(f"Error in {caller}: {e}\n{error_details}")
+                    return 0.0
+                time.sleep(0.1 * (retry + 1))  # Linear backoff: 0.1s, then 0.2s
+        return 0.0
+    def run_code(self, code: str, inputs: str, imports: List[str] = []) -> Tuple[str, str]:
+        """Run `code` on `inputs` via RUN_CODE_TEMPLATE; return (output, status).
+        Returns ('', 'error') without running when ast_check is on and the
+        snippet does not parse.
+        """
+        code = self._prepend_imports(code, imports)
+        code_snippet = RUN_CODE_TEMPLATE.format(code=code, inputs=inputs)
+        if self.ast_check and not self._is_parseable(code_snippet):
+            return '', 'error'
+        return self.apply(code_snippet)
+    def validate_code(self, code: str, inputs: str, imports: List[str] = []) -> bool:
+        """Return True if the VALIDATE_CODE_TEMPLATE snippet runs without 'error' in its status."""
+        code = self._prepend_imports(code, imports)
+        code_snippet = VALIDATE_CODE_TEMPLATE.format(code=code, inputs=inputs)
+        if self.ast_check and not self._is_parseable(code_snippet):
+            return False
+        _, status = self.apply(code_snippet)
+        return 'error' not in status.lower()
+    def eval_input_prediction(self, code: str, gold_output: str, agent_input: str, imports: List[str] = []) -> float:
+        """Score a predicted input: 1.0 if the evaluation snippet reports success, else 0.0."""
+        code = self._prepend_imports(code, imports)
+        code_snippet = EVAL_INPUT_PREDICTION_TEMPLATE.format(code=code, gold_output=gold_output, agent_input=agent_input)
+        if self.ast_check and not self._is_parseable(code_snippet):
+            return 0.0
+        return self._score_snippet(code_snippet, 'eval_input_prediction')
+    def eval_output_prediction(self, code: str, gold_output: str, agent_output: str, imports: List[str] = []) -> float:
+        """Score a predicted output: 1.0 if it matches `gold_output`, else 0.0.
+        A fast path compares the two evaluated strings directly; only when
+        that does not succeed is the evaluation snippet executed.
+        """
+        try: # fast check if we dont need to run the code
+            # SECURITY: both strings are eval'd in the calling process and
+            # `agent_output` is a prediction; treat it as untrusted.
+            if eval(gold_output) == eval(agent_output):
+                return 1.0
+        except (Exception, SystemExit):
+            # SystemExit is kept so a prediction like "exit()" cannot stop
+            # the caller; KeyboardInterrupt is no longer swallowed.
+            pass
+        code = self._prepend_imports(code, imports)
+        code_snippet = EVAL_OUTPUT_PREDICTION_TEMPLATE.format(code=code, gold_output=gold_output, agent_output=agent_output)
+        if self.ast_check and not self._is_parseable(code_snippet):
+            return 0.0
+        return self._score_snippet(code_snippet, 'eval_output_prediction')
+    def eval_k_input_prediction(self, code: str, gold_output: str, k_agent_inputs: List[str], imports: List[str] = []) -> List[float]:
+        """Score k predicted inputs in one run.
+        Candidates for which `f(<candidate>)` does not parse are not executed
+        and score 0.0. The returned list has one score per candidate, in the
+        same order as `k_agent_inputs`. Raises AssertionError if the run
+        reports an error or returns an unexpected number of scores.
+        """
+        code = self._prepend_imports(code, imports)
+        valid_positions = []
+        valid_k_agent_inputs = []
+        for position, k_agent_input in enumerate(k_agent_inputs):
+            if self._is_parseable(f'f({k_agent_input})'):
+                valid_positions.append(position)
+                valid_k_agent_inputs.append(k_agent_input)
+        acc_list, status = self.apply(EVAL_K_INPUT_PREDICTION_TEMPLATE(code=code, gold_output=gold_output, k_agent_inputs=valid_k_agent_inputs))
+        assert 'error' not in status.lower()
+        # SECURITY: `acc_list` is text produced by the executed snippet.
+        return self._scatter_scores(eval(acc_list), valid_positions, len(k_agent_inputs))
+    def eval_k_output_prediction(self, code: str, gold_output: str, k_agent_outputs: List[str], imports: List[str] = []) -> List[float]:
+        """Score k predicted outputs in one run.
+        Empty candidates and candidates for which `f(<candidate>)` does not
+        parse are not executed and score 0.0. The returned list has one score
+        per candidate, in the same order as `k_agent_outputs`. Raises
+        AssertionError if the run reports an error or returns an unexpected
+        number of scores.
+        """
+        code = self._prepend_imports(code, imports)
+        valid_positions = []
+        valid_k_agent_outputs = []
+        for position, k_agent_output in enumerate(k_agent_outputs):
+            # '' must be rejected explicitly because `f()` parses fine.
+            if k_agent_output != '' and self._is_parseable(f'f({k_agent_output})'):
+                valid_positions.append(position)
+                valid_k_agent_outputs.append(k_agent_output)
+        acc_list, status = self.apply(EVAL_K_OUTPUT_PREDICTION_TEMPLATE(code=code, gold_output=gold_output, k_agent_outputs=valid_k_agent_outputs))
+        assert 'error' not in status.lower()
+        # SECURITY: `acc_list` is text produced by the executed snippet.
+        return self._scatter_scores(eval(acc_list), valid_positions, len(k_agent_outputs))
+    def check_all(
+        self,
+        code: str,
+        inputs: str,
+        banned_keywords: List[str] = [],
+        check_determinism: bool = True,
+        imports: List[str] = [],
+        check_error: bool = False,
+        banned_keywords_for_errors_and_exceptions: List[str] = [],
+    ) -> Tuple[bool, str]:
+        """Validate a program/input pair; returns (ok, info).
+        - Banned imports/keywords found: (False, None).
+        - check_error=True: the program may raise. Returns (True, 'NoError')
+          when the status is "done", else (True, <parsed error name>).
+          Returns (False, 'error') when the snippet does not parse or, with
+          check_determinism, when two runs differ in status or output.
+        - check_error=False: returns (status has no 'error', output), using
+          CHECK_DETERMINISM_TEMPLATE when check_determinism is set.
+        """
+        code = self._prepend_imports(code, imports)
+        if contains_banned_imports(code=code, banned_keywords=banned_keywords, banned_keywords_for_errors_and_exceptions=banned_keywords_for_errors_and_exceptions if check_error else []):
+            return False, None
+        if check_error:
+            code_snippet = RUN_CODE_TEMPLATE.format(code=code, inputs=inputs)
+            if not self._is_parseable(code_snippet):
+                return False, 'error'
+            output, status = self.apply(code_snippet)
+            if check_determinism: # run the code again, see if outputs are same
+                output_2, status_2 = self.apply(code_snippet)
+                # Either difference means non-determinism. The previous `and`
+                # required both to differ, so a program returning random
+                # values with status "Done" slipped through.
+                if status_2.lower() != status.lower() or output != output_2:
+                    return False, 'error'
+            # True if the code is valid code but might have error, output no error if the code returns something
+            return True, 'NoError' if status.lower() == 'done' else parse_error(status)
+        if check_determinism:
+            code_snippet = CHECK_DETERMINISM_TEMPLATE.format(code=code, inputs=inputs)
+        else:
+            code_snippet = RUN_CODE_TEMPLATE.format(code=code, inputs=inputs)
+        if self.ast_check and not self._is_parseable(code_snippet):
+            return False, 'error'
+        output, status = self.apply(code_snippet)
+        return 'error' not in status.lower(), output
+    @staticmethod
+    def execute(
+        code,
+        get_answer_from_stdout=None,
+        runtime=None,
+        answer_symbol=None,
+        answer_expr=None,
+        timeout_length=10,
+        auto_mode=False
+    ):
+        """Run one snippet (a list of source lines) in `runtime`.
+        auto_mode: if the last line contains "print(", run everything and
+        return captured stdout; otherwise run all but the last line and
+        return the value of the last line evaluated as an expression.
+        Without auto_mode the answer comes from stdout, from the global
+        `answer_symbol`, from `answer_expr`, or from the last line, in that
+        order of precedence.
+        Returns (result, "Done") on success. On any failure, including a
+        result that cannot be converted to str or pickled, returns
+        ('', <last traceback line>).
+        """
+        def run_lines(lines):
+            timeout(timeout_length)(runtime.exec_code)('\n'.join(lines))
+        def run_capturing_stdout(lines):
+            program_io = io.StringIO()
+            with redirect_stdout(program_io):
+                run_lines(lines)
+            return program_io.getvalue()
+        def run_then_eval_last(lines):
+            run_lines(lines[:-1])
+            return timeout(timeout_length)(runtime.eval_code)(lines[-1])
+        try:
+            if auto_mode:
+                if "print(" in code[-1]:
+                    result = run_capturing_stdout(code)
+                else:
+                    result = run_then_eval_last(code)
+            elif get_answer_from_stdout:
+                result = run_capturing_stdout(code)
+            elif answer_symbol:
+                run_lines(code)
+                result = runtime._global_vars[answer_symbol]
+            elif answer_expr:
+                run_lines(code)
+                result = timeout(timeout_length)(runtime.eval_code)(answer_expr)
+            else:
+                result = run_then_eval_last(code)
+            report = "Done"
+            str(result)           # codec check
+            pickle.dumps(result)  # serialization check
+        except BaseException:
+            # Deliberately as broad as a bare except: executed code may call
+            # exit() or hit the timeout, and both must become a report.
+            result = ''
+            report = traceback.format_exc().split('\n')[-2]
+        return result, report
+    def apply(self, code):
+        """Run a single snippet and return its (result, report) pair."""
+        return self.batch_apply([code])[0]
+    @staticmethod
+    def truncate(s, max_length=400):
+        """Shorten `s` to its first and last max_length // 2 characters joined by "..."."""
+        half = max_length // 2
+        if len(s) > max_length:
+            s = s[:half] + "..." + s[-half:]
+        return s
+    def batch_apply(self, batch_code):
+        """Run every snippet of `batch_code` in the pool.
+        Returns one (result, report) pair of stripped, truncated strings per
+        snippet, in input order. Timeouts and worker errors become reports
+        instead of exceptions; after a critical pool error the remaining
+        snippets are reported as "Critical Error: ..." and the pool is reset.
+        """
+        all_code_snippets = self.process_generation_to_code(batch_code)
+        if len(all_code_snippets) == 0:
+            return []  # nothing to run; avoids asking for a zero-worker pool
+        all_exec_results = []
+        pool = self._get_process_pool(len(all_code_snippets))
+        executor = partial(
+            self.execute,
+            get_answer_from_stdout=self.get_answer_from_stdout,
+            runtime=self.runtime,
+            answer_symbol=self.answer_symbol,
+            answer_expr=self.answer_expr,
+            timeout_length=self.timeout_length,
+            auto_mode=True
+        )
+        progress_bar = None
+        try:
+            future = pool.map(executor, all_code_snippets, timeout=self.timeout_length)
+            iterator = future.result()
+            if len(all_code_snippets) > 100:
+                progress_bar = tqdm(total=len(all_code_snippets), desc="Execute")
+            while True:
+                try:
+                    result = next(iterator)
+                    all_exec_results.append(result)
+                except StopIteration:
+                    break
+                except TimeoutError as error:
+                    logging.warning(f"Timeout error in code execution: {error}")
+                    all_exec_results.append(("", "Timeout Error"))
+                except Exception as error:
+                    logging.warning(f"Error in code execution: {error}")
+                    all_exec_results.append(("", f"Error: {str(error)}"))
+                if progress_bar is not None:
+                    progress_bar.update(1)
+        except Exception as e:
+            logging.error(f"Critical error in batch execution: {e}")
+            # Make sure we have results for all snippets
+            while len(all_exec_results) < len(all_code_snippets):
+                all_exec_results.append(("", f"Critical Error: {str(e)}"))
+            # Cleanup the pool on critical errors
+            self.cleanup()
+        finally:
+            if progress_bar is not None:
+                progress_bar.close()
+        batch_results = []
+        for _, (res, report) in zip(all_code_snippets, all_exec_results):
+            # post processing
+            res, report = str(res).strip(), str(report).strip()
+            res, report = self.truncate(res), self.truncate(report)
+            batch_results.append((res, report))
+        return batch_results
+def _test():
+    """Manual smoke run: execute one snippet and print the (result, report) pair.
+    The snippet calls a one-parameter function with two arguments.
+    """
+    snippet = "\ndef f(a):\n    return a\nprint(f(1,2))\n"
+    runner = PythonExecutor(get_answer_from_stdout=True)
+    outcome = runner.apply(snippet)
+    print(outcome)
+if __name__ == '__main__':
+    _test()

absolute_zero_reasoner/utils/code_utils/sandboxfusion_executor.py ADDED Viewed

	@@ -0,0 +1,372 @@

+import traceback
+from typing import List, Tuple
+import ast
+import time
+import requests
+import docker
+from docker.errors import DockerException
+import socket
+import numpy as np
+from pebble import ProcessPool
+from sandbox_fusion import run_code, RunCodeRequest, set_endpoint, RunStatus
+from absolute_zero_reasoner.utils.code_utils.templates import (
+    RUN_CODE_TEMPLATE_REPR,
+    EVAL_INPUT_PREDICTION_TEMPLATE_REPR,
+    EVAL_OUTPUT_PREDICTION_TEMPLATE_REPR,
+    VALIDATE_CODE_TEMPLATE_REPR,
+    CHECK_DETERMINISM_TEMPLATE_REPR,
+    EVAL_K_INPUT_PREDICTION_TEMPLATE,
+    EVAL_K_OUTPUT_PREDICTION_TEMPLATE,
+)
+from absolute_zero_reasoner.utils.code_utils.checks import contains_banned_imports
+from absolute_zero_reasoner.utils.code_utils.parsers import parse_error
+# Docker images for the sandbox server, keyed by registry.
+# DockerAPIRunner picks 'china' when use_china_mirror is true, else 'global'.
+IMAGES = {
+    'global': 'volcengine/sandbox-fusion:server-20250609',
+    'china': 'vemlp-cn-beijing.cr.volces.com/preset-images/code-sandbox:server-20250609'
+}
+class DockerAPIRunner:
+    """Start, probe and stop one sandbox server container via the Docker SDK.
+    The container's port 8080 is published on a free host port chosen at
+    construction time (`self.port`).
+    """
+    def __init__(self, use_china_mirror=True, silent=False):
+        """Pick the image, connect to Docker and reserve a host port.
+        Args:
+            use_china_mirror: use IMAGES['china'] instead of IMAGES['global'].
+            silent: suppress the progress messages printed by this class.
+        """
+        self.image = IMAGES['china'] if use_china_mirror else IMAGES['global']
+        self.container = None
+        self.silent = silent
+        self.client = docker.from_env()
+        self.port = self._find_free_port()
+    def _log(self, message):
+        """Print `message` unless the runner is silent."""
+        if not self.silent:
+            print(message)
+    def _find_free_port(self):
+        """Find an available port dynamically"""
+        # NOTE: the port is released before Docker binds it, so another
+        # process could grab it in between.
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(('', 0))
+            s.listen(1)
+            port = s.getsockname()[1]
+        return port
+    def start(self):
+        """Start the Docker container using Docker API
+        Returns True on success, False if Docker reported an error.
+        """
+        try:
+            # Pull only when the image is missing locally, so a host with a
+            # cached image does not need registry access on every start.
+            try:
+                self.client.images.get(self.image)
+            except docker.errors.ImageNotFound:
+                self._log(f"Pulling image: {self.image}")
+                self.client.images.pull(self.image)
+            # Run container
+            self.container = self.client.containers.run(
+                self.image,
+                ports={'8080/tcp': self.port},
+                detach=True,
+                remove=True  # Auto-remove when stopped
+            )
+            self._log(f"Container started: {self.container.short_id}")
+            return True
+        except DockerException as e:
+            self._log(f"Error starting container: {e}")
+            return False
+    def stop(self):
+        """Stop the Docker container
+        Returns True if a container was stopped. Returns False when there is
+        none or stopping failed. After a successful stop the handle is
+        cleared, so calling stop() again is a quiet no-op.
+        """
+        if not self.container:
+            return False
+        try:
+            self.container.stop()
+        except DockerException as e:
+            self._log(f"Error stopping container: {e}")
+            return False
+        self.container = None
+        self._log("Container stopped")
+        return True
+    def _port_is_open(self) -> bool:
+        """Return True if a TCP connection to localhost:self.port succeeds."""
+        try:
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+                sock.settimeout(2)
+                return sock.connect_ex(('localhost', self.port)) == 0
+        except OSError:
+            return False
+    def _wait_for_container_ready(self, max_wait_time: int = 60, check_interval: float = 1.0):
+        """Wait for the Docker container to be ready
+        Polls every `check_interval` seconds until the container is running
+        and its published port accepts connections, then returns True.
+        Raises Exception if no container was started, if the container
+        exits or dies, or if it is not ready within `max_wait_time` seconds.
+        """
+        if not self.container:
+            raise Exception("Container not started")
+        start_time = time.time()
+        while time.time() - start_time < max_wait_time:
+            # Reload container status
+            self.container.reload()
+            self._log(f"Container status: {self.container.status}")
+            if self.container.status == 'running':
+                if self._port_is_open():
+                    # Try to make a simple request to test the service
+                    try:
+                        response = requests.get(f'http://localhost:{self.port}/', timeout=2)
+                        self._log(f"Service responded with status: {response.status_code}")
+                    except requests.exceptions.RequestException:
+                        # An open port is accepted as ready even without an HTTP answer.
+                        self._log(f"Port {self.port} is open, assuming service is ready")
+                    return True
+            elif self.container.status in ['exited', 'dead']:
+                # Get container logs for debugging
+                logs = self.container.logs().decode('utf-8')
+                raise Exception(f"Container failed to start. Status: {self.container.status}. Logs: {logs[:500]}")
+            time.sleep(check_interval)
+        # Get final container logs for debugging
+        logs = self.container.logs().decode('utf-8') if self.container else "No container"
+        raise Exception(f"Container not ready after {max_wait_time} seconds. Final status: {self.container.status if self.container else 'None'}. Logs: {logs[:500]}")
+class SandboxfusionExecutor:
+    def __init__(
+        self,
+        timeout_length: int = 10,
+        ast_check: bool = False,
+        max_workers: int = 1,
+        use_china_mirror: bool = True,
+    ) -> None:
+        """Start the sandbox container and point the sandbox client at it.
+        Args:
+            timeout_length, ast_check, max_workers: stored on the instance.
+            use_china_mirror: forwarded to DockerAPIRunner to choose the image.
+        Raises:
+            Exception: if the container cannot be started or does not become
+                ready. The container is stopped before the error propagates.
+        """
+        self.runner = DockerAPIRunner(use_china_mirror=use_china_mirror)
+        running = self.runner.start()
+        if not running:
+            raise Exception("Failed to start Sandboxfusion Docker container")
+        # Wait for the container to be ready
+        try:
+            self._wait_for_container_ready()
+        except BaseException:
+            # Do not rely on __del__ to reclaim a container that was started
+            # but never became usable (also covers Ctrl-C while waiting).
+            self.runner.stop()
+            raise
+        set_endpoint(f'http://localhost:{self.runner.port}')
+        self.timeout_length = timeout_length
+        self.ast_check = ast_check
+        self.max_workers = max_workers
+    def _wait_for_container_ready(self, max_wait_time: int = 60, check_interval: float = 1.0):
+        """Delegate the readiness wait to the runner; returns None."""
+        runner = self.runner
+        runner._wait_for_container_ready(
+            max_wait_time,
+            check_interval,
+        )
+    def __del__(self):
+        """Best-effort stop of the sandbox container when the executor is collected."""
+        try:
+            # cleanup() already stops the runner; stopping it a second time
+            # here only produced an error for the already-stopped container.
+            self.cleanup()
+        except Exception as e:
+            # Also reached when __init__ failed before `runner` was assigned.
+            print(f"Error stopping sandbox container: {e}")
+    def cleanup(self):
+        """Stop the runner's container; returns None."""
+        runner = self.runner
+        runner.stop()
+    def process_generation_to_code(self, gens: List[str]) -> List[List[str]]:
+        """Strip each generation and split it into its lines.
+        `gens` is a sequence of code strings (the previous `str` annotation
+        was wrong: a single string would be split character by character).
+        """
+        return [g.strip().split('\n') for g in gens]
+    def run_code(self, code: str, inputs: str, imports: List[str] = []) -> Tuple[str, str]:
+        """Run `code` on `inputs` via RUN_CODE_TEMPLATE_REPR and return what `self.apply` returns.
+        `imports` (list or numpy array of import lines) is placed in front of
+        `code`. Returns ('', 'error') without running when `ast_check` is on
+        and the snippet does not parse.
+        """
+        if isinstance(imports, np.ndarray):
+            # Truth-testing a multi-element array raises, so convert first.
+            imports = imports.tolist()
+        if imports:
+            code = '\n'.join(imports) + '\n' + code
+        code_snippet = RUN_CODE_TEMPLATE_REPR.format(code=code, inputs=inputs)
+        if self.ast_check:
+            try:
+                ast.parse(code_snippet)
+            except Exception:
+                # Narrowed from a bare except so Ctrl-C is not reported as a code error.
+                return '', 'error'
+        return self.apply(code_snippet)
+    def validate_code(self, code: str, inputs: str, imports: List[str] = []) -> bool:
+        """Return True if the VALIDATE_CODE_TEMPLATE_REPR snippet runs without 'error' in its status.
+        `imports` (list or numpy array of import lines) is placed in front of
+        `code`. Returns False without running when `ast_check` is on and the
+        snippet does not parse.
+        """
+        if isinstance(imports, np.ndarray):
+            # Truth-testing a multi-element array raises, so convert first.
+            imports = imports.tolist()
+        if imports:
+            code = '\n'.join(imports) + '\n' + code
+        code_snippet = VALIDATE_CODE_TEMPLATE_REPR.format(code=code, inputs=inputs)
+        if self.ast_check:
+            try:
+                ast.parse(code_snippet)
+            except Exception:
+                # Narrowed from a bare except so Ctrl-C is not reported as invalid code.
+                return False
+        _, status = self.apply(code_snippet)
+        return 'error' not in status.lower()
+    def eval_input_prediction(self, code: str, gold_output: str, agent_input: str, imports: List[str] = []) -> float:
+        """Score a predicted input: 1.0 if the evaluation snippet reports success, else 0.0.
+        The snippet is built from EVAL_INPUT_PREDICTION_TEMPLATE_REPR and run
+        through `self.apply`. The run is retried up to three times when it
+        raises; after the last failure the score is 0.0 (previously None,
+        which contradicted the declared float return type).
+        """
+        if isinstance(imports, np.ndarray):
+            imports = imports.tolist()
+        if imports:
+            code = '\n'.join(imports) + '\n' + code
+        code_snippet = EVAL_INPUT_PREDICTION_TEMPLATE_REPR.format(code=code, gold_output=gold_output, agent_input=agent_input)
+        if self.ast_check:
+            try:
+                ast.parse(code_snippet)
+            except Exception:
+                return 0.0
+        max_retries = 3
+        for retry in range(max_retries):
+            try:
+                correct, status = self.apply(code_snippet)
+                # SECURITY: `correct` is text returned by the executed snippet
+                # and is eval'd here, in the calling process.
+                return 0.0 if 'error' in status.lower() or not eval(correct) else 1.0
+            except Exception as e:
+                if retry == max_retries - 1:
+                    error_details = traceback.format_exc()
+                    print(f"Error in eval_input_prediction: {e}\n{error_details}")
+                    return 0.0
+                time.sleep(0.1 * (retry + 1))  # Linear backoff: 0.1s, then 0.2s
+        return 0.0
+    def eval_output_prediction(self, code: str, gold_output: str, agent_output: str, imports: List[str] = []) -> float:
+        """Score a predicted output: 1.0 if it matches `gold_output`, else 0.0.
+        A fast path compares the two evaluated strings directly. Only when
+        that does not succeed is the EVAL_OUTPUT_PREDICTION_TEMPLATE_REPR
+        snippet run through `self.apply`, retried up to three times when it
+        raises. After the last failure the score is 0.0 (previously None,
+        which contradicted the declared float return type).
+        """
+        try: # fast check if we dont need to run the code
+            # SECURITY: both strings are eval'd in the calling process and
+            # `agent_output` is a prediction; treat it as untrusted.
+            if eval(gold_output) == eval(agent_output):
+                return 1.0
+        except (Exception, SystemExit):
+            # SystemExit is kept so a prediction like "exit()" cannot stop
+            # the caller; KeyboardInterrupt is no longer swallowed.
+            pass
+        if isinstance(imports, np.ndarray):
+            imports = imports.tolist()
+        if imports:
+            code = '\n'.join(imports) + '\n' + code
+        code_snippet = EVAL_OUTPUT_PREDICTION_TEMPLATE_REPR.format(code=code, gold_output=gold_output, agent_output=agent_output)
+        if self.ast_check:
+            try:
+                ast.parse(code_snippet)
+            except Exception:
+                return 0.0
+        max_retries = 3
+        for retry in range(max_retries):
+            try:
+                correct, status = self.apply(code_snippet)
+                # SECURITY: `correct` is text returned by the executed snippet.
+                return 0.0 if 'error' in status.lower() or not eval(correct) else 1.0
+            except Exception as e:
+                if retry == max_retries - 1:
+                    error_details = traceback.format_exc()
+                    print(f"Error in eval_output_prediction: {e}\n{error_details}")
+                    return 0.0
+                time.sleep(0.1 * (retry + 1))  # Linear backoff: 0.1s, then 0.2s
+        return 0.0
+    def eval_k_input_prediction(self, code: str, gold_output: str, k_agent_inputs: List[str], imports: List[str] = []) -> List[float]:
+        """Score k predicted inputs with one call to `self.apply`.
+        Candidates for which `f(<candidate>)` does not parse are not executed
+        and score 0.0. The returned list has one score per candidate, in the
+        same order as `k_agent_inputs` (previously the zeros of skipped
+        candidates were appended at the end, shifting the other scores).
+        Raises AssertionError if the run reports an error or returns a
+        different number of scores than candidates executed.
+        """
+        if isinstance(imports, np.ndarray):
+            imports = imports.tolist()
+        if imports:
+            code = '\n'.join(imports) + '\n' + code
+        valid_positions = []
+        valid_k_agent_inputs = []
+        for position, k_agent_input in enumerate(k_agent_inputs):
+            try:
+                ast.parse(f'f({k_agent_input})')
+            except Exception:
+                continue  # unparseable candidate keeps its 0.0 score
+            valid_positions.append(position)
+            valid_k_agent_inputs.append(k_agent_input)
+        acc_list, status = self.apply(EVAL_K_INPUT_PREDICTION_TEMPLATE(code=code, gold_output=gold_output, k_agent_inputs=valid_k_agent_inputs, repr_output=True))
+        assert 'error' not in status.lower()
+        # SECURITY: `acc_list` is text returned by the executed snippet and
+        # is eval'd here, in the calling process.
+        valid_scores = eval(acc_list)
+        assert len(valid_scores) == len(valid_positions)
+        output_acc = [0.0] * len(k_agent_inputs)
+        for position, score in zip(valid_positions, valid_scores):
+            output_acc[position] = score
+        return output_acc
+    def eval_k_output_prediction(self, code: str, gold_output: str, k_agent_outputs: List[str], imports: List[str] = []) -> List[float]:
+        """Score k predicted outputs with one call to `self.apply`.
+        Empty candidates and candidates for which `f(<candidate>)` does not
+        parse are not executed and score 0.0. The returned list has one score
+        per candidate, in the same order as `k_agent_outputs` (previously the
+        zeros of skipped candidates were appended at the end, shifting the
+        other scores).
+        Raises AssertionError if the run reports an error or returns a
+        different number of scores than candidates executed.
+        """
+        if isinstance(imports, np.ndarray):
+            imports = imports.tolist()
+        if imports:
+            code = '\n'.join(imports) + '\n' + code
+        valid_positions = []
+        valid_k_agent_outputs = []
+        for position, k_agent_output in enumerate(k_agent_outputs):
+            if k_agent_output == '':
+                continue  # '' must be rejected explicitly because `f()` parses fine
+            try:
+                ast.parse(f'f({k_agent_output})')
+            except Exception:
+                continue  # unparseable candidate keeps its 0.0 score
+            valid_positions.append(position)
+            valid_k_agent_outputs.append(k_agent_output)
+        acc_list, status = self.apply(EVAL_K_OUTPUT_PREDICTION_TEMPLATE(code=code, gold_output=gold_output, k_agent_outputs=valid_k_agent_outputs, repr_output=True))
+        assert 'error' not in status.lower()
+        # SECURITY: `acc_list` is text returned by the executed snippet and
+        # is eval'd here, in the calling process.
+        valid_scores = eval(acc_list)
+        assert len(valid_scores) == len(valid_positions)
+        output_acc = [0.0] * len(k_agent_outputs)
+        for position, score in zip(valid_positions, valid_scores):
+            output_acc[position] = score
+        return output_acc
+    def check_all(
+        self,
+        code: str,
+        inputs: str,
+        banned_keywords: List[str] = [],
+        check_determinism: bool = True,
+        imports: List[str] = [],
+        check_error: bool = False,
+        banned_keywords_for_errors_and_exceptions: List[str] = [],
+    ) -> Tuple[bool, str]:
+        """Validate a program/input pair by screening it and running it in the sandbox.
+        Args:
+            code: Source that defines ``f``.
+            inputs: Argument text spliced into ``f(...)``.
+            banned_keywords: Forwarded to ``contains_banned_imports``.
+            check_determinism: Whether to verify that two runs agree.
+            imports: Import lines prepended to ``code`` (a numpy array is accepted).
+            check_error: When true, a program that runs into an error is still accepted
+                and the error is reported instead of the output.
+            banned_keywords_for_errors_and_exceptions: Forwarded to
+                ``contains_banned_imports`` only when ``check_error`` is true.
+        Returns:
+            ``(False, None)`` when ``contains_banned_imports`` flags the code.
+            ``(False, 'error')`` when the generated snippet does not parse or, in
+            ``check_error`` mode, when the two runs disagree.
+            In ``check_error`` mode otherwise ``(True, 'NoError')`` for a clean run or
+            ``(True, parse_error(status))`` for a failed one.
+            Without ``check_error``: ``(ok, output)`` where ``ok`` is false if the
+            sandbox status mentions an error.
+        """
+        if isinstance(imports, np.ndarray):
+            imports = imports.tolist()
+        if imports:
+            code = '\n'.join(imports) + '\n' + code
+        if contains_banned_imports(code=code, banned_keywords=banned_keywords, banned_keywords_for_errors_and_exceptions=banned_keywords_for_errors_and_exceptions if check_error else []):
+            return False, None
+        if check_error:
+            code_snippet = RUN_CODE_TEMPLATE_REPR.format(code=code, inputs=inputs)
+            try:
+                ast.parse(code_snippet)
+            except Exception:
+                return False, 'error'
+            output, status = self.apply(code_snippet)
+            if check_determinism:
+                # Run a second time; a difference in EITHER the status or the output
+                # means the program is not deterministic.
+                output_2, status_2 = self.apply(code_snippet)
+                if status_2.lower() != status.lower() or output != output_2:
+                    return False, 'error'
+            # The code is accepted even if it fails at run time; report which error it hit.
+            return True, 'NoError' if status.lower() == 'done' else parse_error(status)
+        else:
+            if check_determinism:
+                code_snippet = CHECK_DETERMINISM_TEMPLATE_REPR.format(code=code, inputs=inputs)
+            else:
+                code_snippet = RUN_CODE_TEMPLATE_REPR.format(code=code, inputs=inputs)
+            if self.ast_check:
+                try:
+                    ast.parse(code_snippet)
+                except Exception:
+                    return False, 'error'
+            output, status = self.apply(code_snippet)
+            return 'error' not in status.lower(), output
+    def apply(self, code) -> Tuple[str, str]:
+        """Run ``code`` through ``run_code`` and return ``(output, status)``.
+        Returns:
+            ``(text, 'done')`` on success, where ``text`` is what follows the last
+            ``<FINAL_REPR_SYMBOL>`` marker in stdout with its first and last character
+            removed; ``('', 'error')`` when the run status is not a success; and
+            ``('Execution error: ...', 'error')`` when anything in the call raises.
+        """
+        try:
+            request = RunCodeRequest(
+                code=code,
+                language='python',
+                compile_timeout=self.timeout_length,
+                run_timeout=self.timeout_length,
+            )
+            response = run_code(request)
+            if response.status != RunStatus.Success:
+                return '', 'error'
+            tail = response.run_result.stdout.split('<FINAL_REPR_SYMBOL>')[-1]
+            # [1:-1] drops the space print() puts after the marker and the trailing newline.
+            return tail[1:-1], 'done'
+        except Exception as e:
+            return f"Execution error: {str(e)}", 'error'
+def _test():
+    """Manual smoke test: send one snippet to the executor and print what comes back.
+    The snippet calls ``f(12eee)``, which is not valid Python, so it exercises the
+    failure path of ``SandboxfusionExecutor.apply``.
+    """
+    malformed_snippet = """
+def f(a):
+    return a
+print('<FINAL_REPR_SYMBOL>', repr(f(12eee)))
+"""
+    sandbox = SandboxfusionExecutor()
+    print(sandbox.apply(malformed_snippet))
+# Only run the smoke test when this module is executed as a script.
+if __name__ == '__main__':
+    _test()

absolute_zero_reasoner/utils/code_utils/templates.py ADDED Viewed

	@@ -0,0 +1,68 @@

+from typing import List
+# Program templates filled in with str.format. `{code}` must define a function `f`;
+# the remaining placeholders are spliced into the program text verbatim.
+# Plain variants end in a bare expression (presumably for an executor that captures
+# the value of the last expression - TODO confirm); *_REPR variants print the result
+# after the '<FINAL_REPR_SYMBOL>' marker so it can be recovered from stdout.
+# Run f on the given inputs.
+RUN_CODE_TEMPLATE = """{code}
+repr(f({inputs}))"""
+RUN_CODE_TEMPLATE_REPR = """{code}
+print('<FINAL_REPR_SYMBOL>', repr(f({inputs})))"""
+# Same text as the RUN_CODE templates above.
+VALIDATE_CODE_TEMPLATE = """{code}
+repr(f({inputs}))"""
+VALIDATE_CODE_TEMPLATE_REPR = """{code}
+print('<FINAL_REPR_SYMBOL>', repr(f({inputs})))"""
+# Check whether f applied to a predicted input reproduces the gold output.
+EVAL_INPUT_PREDICTION_TEMPLATE = """{code}
+{gold_output} == f({agent_input})"""
+EVAL_INPUT_PREDICTION_TEMPLATE_REPR = """{code}
+print('<FINAL_REPR_SYMBOL>', repr({gold_output} == f({agent_input})))"""
+# Compare a predicted output with the gold output. Both placeholders are placed inside
+# eval(...), so they are presumably string literals holding an expression - verify
+# against the caller.
+EVAL_OUTPUT_PREDICTION_TEMPLATE = """{code}
+eval({gold_output}) == eval({agent_output})"""
+EVAL_OUTPUT_PREDICTION_TEMPLATE_REPR = """{code}
+print('<FINAL_REPR_SYMBOL>', repr(eval({gold_output}) == eval({agent_output})))"""
+# Call f twice on the same inputs and raise if the two results differ.
+CHECK_DETERMINISM_TEMPLATE = """{code}
+returns = f({inputs})
+if returns != f({inputs}):
+    raise Exception('Non-deterministic code')
+repr(returns)"""
+CHECK_DETERMINISM_TEMPLATE_REPR = """{code}
+returns = f({inputs})
+if returns != f({inputs}):
+    raise Exception('Non-deterministic code')
+print('<FINAL_REPR_SYMBOL>', repr(returns))"""
+def EVAL_K_INPUT_PREDICTION_TEMPLATE(code: str, gold_output: str, k_agent_inputs: List[str], repr_output: bool = False):
+    """Build a program that checks several candidate inputs against one gold output.
+    The program starts with ``code``, then appends ``gold_output == f(<candidate>)`` to
+    ``acc_list`` for every candidate, recording ``False`` when the comparison raises.
+    Args:
+        code: Source that defines ``f``.
+        gold_output: Expression text placed on the left of each comparison.
+        k_agent_inputs: Candidate argument strings, spliced into ``f(...)`` verbatim.
+        repr_output: If true the program prints ``repr(acc_list)`` after the
+            ``<FINAL_REPR_SYMBOL>`` marker; otherwise it ends with a bare ``acc_list``.
+    Returns:
+        The generated program text.
+    """
+    pieces = [f"{code}\nacc_list = []"]
+    pieces.extend(
+        f"\ntry:\n    acc_list.append({gold_output} == f({candidate}))\nexcept:\n    acc_list.append(False)"
+        for candidate in k_agent_inputs
+    )
+    # Expose the raw per-candidate results; no aggregation happens in the program.
+    if repr_output:
+        pieces.append("\nprint('<FINAL_REPR_SYMBOL>', repr(acc_list))")
+    else:
+        pieces.append("\nacc_list")
+    return ''.join(pieces)
+def EVAL_K_OUTPUT_PREDICTION_TEMPLATE(code: str, gold_output: str, k_agent_outputs: List[str], repr_output: bool = False):
+    """Build a program that compares several candidate outputs with one gold output.
+    The program starts with ``code``, then appends ``gold_output == <candidate>`` to
+    ``acc_list`` for every candidate, recording ``False`` when the comparison raises.
+    Args:
+        code: Source placed at the top of the program.
+        gold_output: Expression text placed on the left of each comparison.
+        k_agent_outputs: Candidate expressions, spliced in verbatim.
+        repr_output: If true the program prints ``repr(acc_list)`` after the
+            ``<FINAL_REPR_SYMBOL>`` marker; otherwise it ends with a bare ``acc_list``.
+    Returns:
+        The generated program text.
+    """
+    pieces = [f"{code}\nacc_list = []"]
+    pieces.extend(
+        f"\ntry:\n    acc_list.append({gold_output} == {candidate})\nexcept:\n    acc_list.append(False)"
+        for candidate in k_agent_outputs
+    )
+    # Expose the raw per-candidate results; no aggregation happens in the program.
+    if repr_output:
+        pieces.append("\nprint('<FINAL_REPR_SYMBOL>', repr(acc_list))")
+    else:
+        pieces.append("\nacc_list")
+    return ''.join(pieces)

absolute_zero_reasoner/utils/convert2hf.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+import torch
+import fire
+from collections import defaultdict
+def main(
+    fsdp_checkpoint_path, huggingface_model_path, output_path, pretrained_tokenizer=True, world_size=4
+):
+    """
+    Merge per-rank FSDP checkpoint shards and save them as a HuggingFace checkpoint.
+    Args:
+        fsdp_checkpoint_path: directory holding ``model_world_size_{world_size}_rank_{rank}.pt``
+        huggingface_model_path: HuggingFace model path the config and tokenizer are loaded from
+        output_path: directory the merged model and tokenizer are written to
+        pretrained_tokenizer: if true, overwrite ``chat_template`` in the saved
+            ``tokenizer_config.json`` with a template that joins message contents with newlines
+        world_size: number of rank shards to read
+    Usage:
+        python absolute_zero_reasoner/utils/convert2hf.py \
+            checkpoints/azr/azr/test/test_answer/Qwen2.5-7B/answer_conditional/global_step_160_copy/actor \
+            checkpoints/azr/azr/test/test_answer/Qwen2.5-7B/answer_conditional/global_step_160_copy/actor/huggingface/ \
+            azr_90_composite_160_steps
+    """
+    shards_by_key = defaultdict(list)
+    for rank in range(int(world_size)):
+        shard_path = f"{fsdp_checkpoint_path}/model_world_size_{world_size}_rank_{rank}.pt"
+        print("loading", shard_path)
+        for key, value in torch.load(shard_path).items():
+            # Keep this rank's local piece of every parameter.
+            shards_by_key[key].append(value.to_local())
+    # Stitch the pieces back together along dim 0, replacing each list in place.
+    for key in shards_by_key:
+        shards_by_key[key] = torch.cat(shards_by_key[key], dim=0)
+    model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(huggingface_model_path))
+    model.load_state_dict(shards_by_key)
+    model.save_pretrained(output_path, max_shard_size="10GB")
+    AutoTokenizer.from_pretrained(huggingface_model_path).save_pretrained(output_path)
+    if pretrained_tokenizer:
+        # Replace the saved chat template with one that only concatenates message contents.
+        chat_template = "{%- for message in messages -%}{{- '\n' if not loop.first -}}{{- message['content'] -}}{%- endfor -%}"
+        import os
+        import json
+        config_file = os.path.join(output_path, "tokenizer_config.json")
+        with open(config_file, "r") as f:
+            tokenizer_config = json.load(f)
+        tokenizer_config["chat_template"] = chat_template
+        with open(config_file, "w") as f:
+            json.dump(tokenizer_config, f)
+# Expose main() as a command-line interface through fire.
+if __name__ == "__main__":
+    fire.Fire(main)

absolute_zero_reasoner/utils/dataset/__init__.py ADDED Viewed

File without changes

absolute_zero_reasoner/utils/dataset/ipo_grouped_sampler.py ADDED Viewed

	@@ -0,0 +1,220 @@

+"""
+IPO Group-aware Batch Sampler for TTRLVR
+Custom samplers that keep tasks sharing the same ipo_group_id together when building batches.
+The goal is for the induction/deduction/abduction tasks generated from the same IPO triple
+to be trained together.
+"""
+import torch
+from torch.utils.data import Sampler, BatchSampler
+from typing import Iterator, List, Optional
+import random
+from collections import defaultdict
+import pandas as pd
+import numpy as np
+class IPOGroupedBatchSampler(Sampler):
+    """Batch sampler that keeps samples sharing an ipo_group_id next to each other.
+    Indices are grouped by ``ipo_group_id``, flattened group by group and then cut into
+    consecutive chunks of ``batch_size``. Members of a group are therefore contiguous,
+    but a group can still straddle a batch boundary when group sizes do not line up
+    with ``batch_size``. Batches are built once in ``__init__``; shuffling only changes
+    the order of the batches and the order of the indices inside each batch.
+    """
+    def __init__(self,
+                 dataset,
+                 batch_size: int,
+                 shuffle: bool = True,
+                 drop_last: bool = False,
+                 seed: int = 42):
+        """
+        Args:
+            dataset: dataset to sample from; if it has a ``dataframe`` attribute, the
+                ``ipo_group_id`` value of each row is used as the group key
+            batch_size: number of indices per batch (must be positive)
+            shuffle: whether to shuffle the batch order and the order inside each batch
+            drop_last: whether to drop the final batch when it is incomplete
+            seed: seed of the private torch generator that drives all shuffling
+        Raises:
+            ValueError: if ``batch_size`` is not positive
+        """
+        if batch_size <= 0:
+            raise ValueError(f"batch_size must be positive, got {batch_size}")
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        self.drop_last = drop_last
+        self.generator = torch.Generator()
+        self.generator.manual_seed(seed)
+        # Map every ipo_group_id to the dataset indices that carry it.
+        self.groups = defaultdict(list)
+        self._build_groups()
+        # Pre-compute the batches once.
+        self._create_batches()
+    def _build_groups(self):
+        """Group dataset indices by ipo_group_id."""
+        has_dataframe = hasattr(self.dataset, 'dataframe')
+        for idx in range(len(self.dataset)):
+            ipo_group_id = None
+            if has_dataframe:
+                ipo_group_id = self.dataset.dataframe.iloc[idx].get('ipo_group_id', None)
+            # Samples without a usable id (or datasets without a dataframe) form a
+            # group of their own.
+            if not ipo_group_id or ipo_group_id == '':
+                ipo_group_id = f'individual_{idx}'
+            self.groups[ipo_group_id].append(idx)
+        print(f"[IPOGroupedBatchSampler] Built {len(self.groups)} IPO groups from {len(self.dataset)} samples")
+        # Group size statistics.
+        group_sizes = [len(indices) for indices in self.groups.values()]
+        if group_sizes:
+            print(f"  - Group sizes: min={min(group_sizes)}, max={max(group_sizes)}, avg={np.mean(group_sizes):.2f}")
+    def _create_batches(self):
+        """Cut the group-ordered index list into batches of ``batch_size``."""
+        # Flatten group by group so that members of a group stay adjacent
+        # (typically 3 per group: induction, deduction, abduction).
+        ordered = [idx for indices in self.groups.values() for idx in indices]
+        self.batches = [
+            ordered[start:start + self.batch_size]
+            for start in range(0, len(ordered), self.batch_size)
+        ]
+        # Only the final batch can be incomplete.
+        if self.drop_last and self.batches and len(self.batches[-1]) < self.batch_size:
+            dropped = self.batches.pop()
+            print(f"[IPOGroupedBatchSampler] Dropped last incomplete batch of size {len(dropped)}")
+        print(f"[IPOGroupedBatchSampler] Created {len(self.batches)} batches")
+    def __iter__(self) -> Iterator[List[int]]:
+        """Yield the batches, shuffled with the seeded generator when enabled."""
+        if self.shuffle:
+            order = torch.randperm(len(self.batches), generator=self.generator).tolist()
+        else:
+            order = range(len(self.batches))
+        for batch_idx in order:
+            batch = self.batches[batch_idx]
+            if self.shuffle:
+                # Shuffle inside the batch with the same seeded generator and yield a
+                # new list, so results are reproducible and self.batches is untouched.
+                inner = torch.randperm(len(batch), generator=self.generator).tolist()
+                yield [batch[i] for i in inner]
+            else:
+                yield list(batch)
+    def __len__(self) -> int:
+        """Number of batches."""
+        return len(self.batches)
+class IPOGroupPreservingBatchSampler(BatchSampler):
+    """
+    Batch sampler that walks the dataset one IPO group at a time.
+    Batches are filled greedily:
+    1. samples sharing an ipo_group_id are emitted consecutively;
+    2. when a group does not fill the current batch, samples of the next group top it up,
+       so a group can spill over into the following batch;
+    3. every sample is emitted exactly once per pass (unless drop_last discards the final
+       incomplete batch).
+    """
+    def __init__(self,
+                 dataset,
+                 batch_size: int,
+                 shuffle: bool = True,
+                 drop_last: bool = False,
+                 seed: int = 42):
+        """
+        Args:
+            dataset: dataset to sample from; if it has a ``dataframe`` attribute, the
+                ``ipo_group_id`` value of each row is used as the group key
+            batch_size: number of indices per batch
+            shuffle: whether to shuffle the group order and the order inside each group
+            drop_last: whether to drop the final batch when it is incomplete
+            seed: seed used for shuffling
+        """
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        self.drop_last = drop_last
+        self.seed = seed
+        # Map every ipo_group_id to the dataset indices that carry it.
+        self.groups = self._build_groups()
+    def _build_groups(self):
+        """Group sample indices by ipo_group_id."""
+        groups = defaultdict(list)
+        has_dataframe = hasattr(self.dataset, 'dataframe')
+        for idx in range(len(self.dataset)):
+            ipo_group_id = ''
+            if has_dataframe:
+                ipo_group_id = self.dataset.dataframe.iloc[idx].get('ipo_group_id', '')
+            # Samples without an id (or datasets without a dataframe) form their own group.
+            if not ipo_group_id:
+                ipo_group_id = f'single_{idx}'
+            groups[ipo_group_id].append(idx)
+        return groups
+    def __iter__(self):
+        """Build and yield the batches for one pass over the dataset."""
+        group_list = list(self.groups.items())
+        # Use a private RNG seeded with self.seed instead of reseeding the global
+        # `random` module, which would silently change the random state of every other
+        # user in the process. random.Random(seed) produces the same shuffles.
+        rng = random.Random(self.seed) if self.shuffle else None
+        if rng is not None:
+            rng.shuffle(group_list)
+        current_batch = []
+        for _group_id, indices in group_list:
+            if rng is not None:
+                # In-place on purpose: the stored order carries over to the next pass.
+                rng.shuffle(indices)
+            for idx in indices:
+                current_batch.append(idx)
+                # Emit the batch as soon as it is full.
+                if len(current_batch) == self.batch_size:
+                    yield current_batch
+                    current_batch = []
+        # Emit the trailing incomplete batch unless it should be dropped.
+        if current_batch and not self.drop_last:
+            yield current_batch
+    def __len__(self):
+        """Number of batches per pass."""
+        total_samples = len(self.dataset)
+        if self.drop_last:
+            return total_samples // self.batch_size
+        return (total_samples + self.batch_size - 1) // self.batch_size