"""
🔥 PHOENIX Retention Research Platform v2.0 - MULTI-GPU OPTIMIZED
H100 x 8 GPU 최적화 버전

✅ v2.0 NEW: Multi-GPU (8x H100) 최적화
✅ v2.0 NEW: Accelerate 통합
✅ v2.0 NEW: DeepSpeed ZeRO-3 지원
✅ v2.0 NEW: Gradient Checkpointing
✅ Fine-tuning 파이프라인 (Brumby-style)
✅ 모든 v1.4.3 수정사항 포함

VIDraft AI Research Lab - Multi-GPU Version v2.0
"""

import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import sqlite3
import json
import time
import numpy as np
from datetime import datetime
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from typing import Dict, List, Any, Tuple, Optional
from transformers import (
    AutoModel, AutoTokenizer, AutoConfig, AutoModelForCausalLM,
    get_cosine_schedule_with_warmup, TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset, concatenate_datasets
from torch.utils.data import Dataset, DataLoader
from accelerate import Accelerator
from tqdm import tqdm
import copy
import shutil
import os
from huggingface_hub import HfApi, create_repo

# =====================================================
# 전역 설정 - MULTI-GPU
# =====================================================

# GPU 설정
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_GPUS = torch.cuda.device_count()

# ✅ 우분투 호환: 홈 디렉토리 또는 환경 변수 사용
STORAGE_PATH = os.getenv("PHOENIX_STORAGE_PATH", str(Path.home() / "phoenix_data"))
DB_PATH = f"{STORAGE_PATH}/phoenix_experiments.db"
MODELS_PATH = f"{STORAGE_PATH}/phoenix_models"
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"

# HuggingFace Token
HF_TOKEN = os.getenv("HF_TOKEN")

# 디렉토리 생성 (권한 오류 처리)
try:
    Path(STORAGE_PATH).mkdir(parents=True, exist_ok=True)
    Path(MODELS_PATH).mkdir(parents=True, exist_ok=True)
    print(f"✅ Storage initialized: {STORAGE_PATH}")
except PermissionError:
    print(f"⚠️ Permission denied for {STORAGE_PATH}")
    print(f"   Using current directory instead")
    STORAGE_PATH = "./phoenix_data"
    DB_PATH = f"{STORAGE_PATH}/phoenix_experiments.db"
    MODELS_PATH = f"{STORAGE_PATH}/phoenix_models"
    Path(STORAGE_PATH).mkdir(parents=True, exist_ok=True)
    Path(MODELS_PATH).mkdir(parents=True, exist_ok=True)

print(f"🔥 PHOENIX Platform v2.0 - Multi-GPU Optimized")
print(f"💾 Storage: {STORAGE_PATH}")
print(f"🎯 Default Base Model: {DEFAULT_MODEL}")
print(f"🚀 GPUs Available: {NUM_GPUS}")
if NUM_GPUS > 0:
    for i in range(NUM_GPUS):
        print(f"   GPU {i}: {torch.cuda.get_device_name(i)}")
if HF_TOKEN:
    print(f"🔑 HuggingFace Token: {'*' * 10}{HF_TOKEN[-4:]}")

# =====================================================
# 모델 구조 분석 함수
# =====================================================

def analyze_model_structure(model_url: str) -> Dict[str, Any]:
    """🔍 모델 구조 사전 분석"""
    print("\n" + "="*80)
    print("🔍 MODEL STRUCTURE ANALYSIS")
    print("="*80)
    
    try:
        print(f"\n📥 Loading model config: {model_url}")
        config = AutoConfig.from_pretrained(model_url, trust_remote_code=True)
        
        print(f"✅ Config loaded")
        
        # ✅ Multi-GPU: CPU로만 로드 (분석용)
        print(f"\n📦 Loading model structure (CPU only)...")
        model = AutoModelForCausalLM.from_pretrained(
            model_url,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="cpu"  # Analysis만 CPU에서
        )
        
        analysis = {
            'model_url': model_url,
            'model_type': config.model_type if hasattr(config, 'model_type') else 'unknown',
            'architectures': config.architectures[0] if hasattr(config, 'architectures') else 'unknown',
            'hidden_size': config.hidden_size if hasattr(config, 'hidden_size') else 0,
            'num_attention_heads': config.num_attention_heads if hasattr(config, 'num_attention_heads') else 0,
            'num_hidden_layers': config.num_hidden_layers if hasattr(config, 'num_hidden_layers') else 0,
            'num_key_value_heads': config.num_key_value_heads if hasattr(config, 'num_key_value_heads') else None,
            'total_layers': 0,
            'has_self_attn': False,
            'layer_path': None,
        }
        
        # Layer 분석
        layers = None
        layer_path = None
        
        possible_paths = [
            ('model.layers', lambda m: m.model.layers if hasattr(m, 'model') and hasattr(m.model, 'layers') else None),
            ('transformer.h', lambda m: m.transformer.h if hasattr(m, 'transformer') and hasattr(m.transformer, 'h') else None),
        ]
        
        for path_name, path_fn in possible_paths:
            result = path_fn(model)
            if result is not None:
                layers = result
                layer_path = path_name
                break
        
        if layers:
            analysis['total_layers'] = len(layers)
            analysis['layer_path'] = layer_path
            
            if len(layers) > 0:
                first_layer = layers[0]
                if hasattr(first_layer, 'self_attn'):
                    analysis['has_self_attn'] = True
                    attn = first_layer.self_attn
                    
                    if hasattr(attn, 'q_proj'):
                        q_shape = attn.q_proj.weight.shape
                        k_shape = attn.k_proj.weight.shape
                        
                        if hasattr(config, 'num_attention_heads') and config.num_attention_heads > 0:
                            head_dim = q_shape[0] // config.num_attention_heads
                            analysis['head_dim'] = head_dim
                        
                        analysis['gqa_detected'] = (k_shape[0] != q_shape[0])
                        analysis['q_dim'] = q_shape[0]
                        analysis['k_dim'] = k_shape[0]
        
        print(f"\n{'='*80}\n")
        
        del model
        torch.cuda.empty_cache()
        
        return analysis
        
    except Exception as e:
        import traceback
        print(f"\n❌ Structure analysis failed: {e}")
        return {
            'model_url': model_url,
            'error': str(e),
            'total_layers': 0,
        }


# =====================================================
# PHOENIX Retention (동일)
# =====================================================

class MultiScaleRetention(nn.Module):
    """진짜 Retention Attention with GQA Support"""
    
    def __init__(self, config, layer_idx=0):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        
        if hasattr(config, 'head_dim'):
            self.head_dim = config.head_dim
        else:
            self.head_dim = self.hidden_size // self.num_heads
        
        if hasattr(config, 'num_key_value_heads'):
            self.num_key_value_heads = config.num_key_value_heads
        else:
            self.num_key_value_heads = self.num_heads
        
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_dim = self.head_dim
        
        self.q_dim = self.num_heads * self.head_dim
        self.kv_dim = self.num_key_value_heads * self.kv_head_dim
        
        self.q_proj = nn.Linear(self.hidden_size, self.q_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False)
        self.o_proj = nn.Linear(self.q_dim, self.hidden_size, bias=False)
        
        decay_values = torch.linspace(0.95, 0.99, self.num_heads)
        self.decay = nn.Parameter(decay_values, requires_grad=True)
        
        self.group_norm = nn.GroupNorm(
            num_groups=self.num_heads, 
            num_channels=self.q_dim
        )
        
    def _repeat_kv(self, hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
        """Repeat K/V heads (GQA)"""
        batch, num_key_value_heads, slen, head_dim = hidden_states.shape
        if n_rep == 1:
            return hidden_states
        
        hidden_states = hidden_states[:, :, None, :, :].expand(
            batch, num_key_value_heads, n_rep, slen, head_dim
        )
        return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
        
    def forward(
        self, 
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.Tensor]] = None,
        **kwargs
    ):
        """O(n) Retention"""
        batch_size, seq_len, _ = hidden_states.shape
        
        target_device = hidden_states.device
        target_dtype = hidden_states.dtype
        
        if self.q_proj.weight.device != target_device or self.q_proj.weight.dtype != target_dtype:
            self.to(device=target_device, dtype=target_dtype)
        
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)
        
        query_states = query_states.view(
            batch_size, seq_len, self.num_heads, self.head_dim
        ).transpose(1, 2)
        
        key_states = key_states.view(
            batch_size, seq_len, self.num_key_value_heads, self.kv_head_dim
        ).transpose(1, 2)
        
        value_states = value_states.view(
            batch_size, seq_len, self.num_key_value_heads, self.kv_head_dim
        ).transpose(1, 2)
        
        key_states = self._repeat_kv(key_states, self.num_key_value_groups)
        value_states = self._repeat_kv(value_states, self.num_key_value_groups)
        
        retention_states = self._compute_retention(
            query_states, key_states, value_states
        )
        
        retention_states = retention_states.transpose(1, 2).contiguous()
        retention_states = retention_states.reshape(
            batch_size, seq_len, self.q_dim
        )
        
        if self.group_norm.weight.device != retention_states.device or self.group_norm.weight.dtype != retention_states.dtype:
            self.group_norm = self.group_norm.to(device=retention_states.device, dtype=retention_states.dtype)
        
        retention_states = self.group_norm(
            retention_states.transpose(1, 2)
        ).transpose(1, 2)
        
        retention_states = torch.clamp(retention_states, min=-10.0, max=10.0)
        
        attn_output = self.o_proj(retention_states)
        
        return (attn_output, None)
    
    def _compute_retention(
        self,
        queries: torch.Tensor,
        keys: torch.Tensor,
        values: torch.Tensor,
    ):
        """O(n) Retention computation"""
        batch_size, num_heads, seq_len, head_dim = queries.shape
        
        state = torch.zeros(
            batch_size, num_heads, head_dim, head_dim,
            dtype=queries.dtype,
            device=queries.device
        ) + 1e-6
        
        outputs = []
        
        decay = torch.sigmoid(self.decay).view(1, -1, 1, 1).to(
            device=queries.device, 
            dtype=queries.dtype
        )
        
        for t in range(seq_len):
            q_t = queries[:, :, t, :]
            k_t = keys[:, :, t, :]
            v_t = values[:, :, t, :]
            
            state = decay * state
            kv_update = torch.einsum('bhd,bhe->bhde', k_t, v_t)
            kv_update = torch.clamp(kv_update, min=-5.0, max=5.0)
            state = state + kv_update
            state = torch.clamp(state, min=-10.0, max=10.0)
            
            output_t = torch.einsum('bhd,bhde->bhe', q_t, state)
            outputs.append(output_t)
        
        output = torch.stack(outputs, dim=2)
        
        return output


class HierarchicalRetention(nn.Module):
    """PHOENIX Hierarchical Retention"""
    
    def __init__(self, config, layer_idx=0):
        super().__init__()
        self.base_retention = MultiScaleRetention(config, layer_idx)
        
        hidden_size = config.hidden_size
        self.d_state = hidden_size // 2
        
        self.short_proj = nn.Linear(hidden_size, self.d_state)
        self.medium_proj = nn.Linear(self.d_state, self.d_state)
        self.long_proj = nn.Linear(self.d_state, self.d_state * 2)
        self.fusion = nn.Linear(self.d_state * 4, hidden_size)
        
        self.short_decay = 0.5
        self.medium_decay = 0.8
        self.long_decay = 0.95
        
        self.norm = nn.LayerNorm(hidden_size)
    
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.Tensor]] = None,
        **kwargs
    ):
        """Hierarchical forward pass"""
        batch_size, seq_len, hidden_size = hidden_states.shape
        
        target_device = hidden_states.device
        target_dtype = hidden_states.dtype
        
        if self.short_proj.weight.device != target_device or self.short_proj.weight.dtype != target_dtype:
            self.to(device=target_device, dtype=target_dtype)
        
        base_result = self.base_retention(
            hidden_states, attention_mask, position_ids,
            past_key_value, output_attentions, use_cache
        )
        
        retention_output = base_result[0]
        
        short_state = torch.zeros(batch_size, self.d_state, dtype=target_dtype, device=target_device)
        medium_state = torch.zeros(batch_size, self.d_state, dtype=target_dtype, device=target_device)
        long_state = torch.zeros(batch_size, self.d_state * 2, dtype=target_dtype, device=target_device)
        
        hierarchical_outputs = []
        
        for t in range(seq_len):
            x_t = retention_output[:, t, :]
            
            short_input = self.short_proj(x_t)
            short_state = self.short_decay * short_state + short_input
            
            if t % 8 == 0:
                medium_state = self.medium_decay * medium_state + \
                              self.medium_proj(short_state)
            
            if t % 64 == 0:
                long_state = self.long_decay * long_state + \
                            self.long_proj(medium_state)
            
            combined = torch.cat([short_state, medium_state, long_state], dim=-1)
            output_t = self.fusion(combined)
            hierarchical_outputs.append(output_t)
        
        output = torch.stack(hierarchical_outputs, dim=1)
        output = self.norm(output)
        
        return (output, None)


# =====================================================
# 모델 변환 함수
# =====================================================

def replace_attention_with_retention(model, use_hierarchical=True, structure_info=None):
    """Transformer Attention → PHOENIX Retention"""
    print("🔄 Starting Attention → Retention conversion...")
    
    replaced_count = 0
    total_layers = 0
    
    layers = None
    
    if structure_info and structure_info.get('layer_path'):
        layer_path = structure_info['layer_path']
        
        if layer_path == 'model.layers':
            if hasattr(model, 'model') and hasattr(model.model, 'layers'):
                layers = model.model.layers
        elif layer_path == 'transformer.h':
            if hasattr(model, 'transformer') and hasattr(model.transformer, 'h'):
                layers = model.transformer.h
    
    if layers is None:
        possible_paths = [
            ('model.layers', lambda m: m.model.layers if hasattr(m, 'model') and hasattr(m.model, 'layers') else None),
            ('transformer.h', lambda m: m.transformer.h if hasattr(m, 'transformer') and hasattr(m.transformer, 'h') else None),
        ]
        
        for path_name, path_fn in possible_paths:
            result = path_fn(model)
            if result is not None:
                layers = result
                break
    
    if layers is None:
        print("❌ Cannot find layers")
        return model, 0, 0
    
    total_layers = len(layers)
    print(f"   Found {total_layers} layers")
    
    if structure_info and structure_info.get('head_dim'):
        model.config.head_dim = structure_info['head_dim']
    
    for layer_idx, layer in enumerate(layers):
        try:
            if hasattr(layer, 'self_attn'):
                old_attn = layer.self_attn
                
                if use_hierarchical:
                    new_retention = HierarchicalRetention(model.config, layer_idx)
                else:
                    new_retention = MultiScaleRetention(model.config, layer_idx)
                
                if hasattr(old_attn, 'q_proj'):
                    try:
                        target = new_retention.base_retention if use_hierarchical else new_retention
                        
                        target.q_proj.weight.data = old_attn.q_proj.weight.data.clone()
                        target.k_proj.weight.data = old_attn.k_proj.weight.data.clone()
                        target.v_proj.weight.data = old_attn.v_proj.weight.data.clone()
                        target.o_proj.weight.data = old_attn.o_proj.weight.data.clone()
                    except:
                        pass
                
                layer.self_attn = new_retention
                replaced_count += 1
                
        except Exception as e:
            continue
    
    print(f"\n✅ Conversion complete: {replaced_count}/{total_layers} layers")
    
    return model, replaced_count, total_layers


# =====================================================
# 🆕 MULTI-GPU Fine-tuning 파이프라인
# =====================================================

def finetune_retention_model(
    model,
    tokenizer,
    num_steps: int = 3000,
    batch_size: int = 4,
    learning_rate: float = 1e-5,
    output_dir: str = None,
    use_gradient_checkpointing: bool = True,
):
    """
    🆕 v2.0: Brumby-style Retraining with Multi-GPU Support
    """
    # output_dir 기본값 설정
    if output_dir is None:
        output_dir = f"{STORAGE_PATH}/finetuning_temp"
    
    print("\n" + "="*80)
    print("🔥 PHOENIX RETRAINING - Multi-GPU (v2.0)")
    print("="*80)
    print(f"   GPUs: {NUM_GPUS}")
    print(f"   Target Steps: {num_steps}")
    print(f"   Batch Size per GPU: {batch_size}")
    print(f"   Global Batch Size: {batch_size * NUM_GPUS}")
    print(f"   Learning Rate: {learning_rate}")
    print(f"   Gradient Checkpointing: {use_gradient_checkpointing}")
    
    start_time = time.time()
    
    # ✅ Gradient Checkpointing (메모리 절약)
    if use_gradient_checkpointing:
        if hasattr(model, 'gradient_checkpointing_enable'):
            model.gradient_checkpointing_enable()
            print(f"   ✅ Gradient Checkpointing enabled")
    
    # Dataset 준비
    train_dataset = prepare_simple_dataset(
        tokenizer=tokenizer,
        num_steps=num_steps,
        batch_size=batch_size * NUM_GPUS  # Multi-GPU 고려
    )
    
    # ✅ Multi-GPU Training Arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        
        # 🚀 Multi-GPU 설정
        per_device_train_batch_size=batch_size,  # GPU당 batch
        gradient_accumulation_steps=max(1, 8 // NUM_GPUS),  # GPU 수에 따라 조정
        
        # Training 설정
        num_train_epochs=1,
        max_steps=num_steps,
        learning_rate=learning_rate,
        warmup_steps=100,
        
        # Optimization
        fp16=True,  # Mixed precision
        optim="adamw_torch_fused",  # H100 최적화
        
        # Logging
        logging_steps=50,
        logging_first_step=True,
        save_steps=1000,
        save_total_limit=2,
        
        # Performance
        dataloader_num_workers=4 * NUM_GPUS,  # GPU당 4 workers
        dataloader_pin_memory=True,
        
        # Multi-GPU 관련
        ddp_find_unused_parameters=False,
        ddp_backend="nccl",  # H100 최적화
        
        # Misc
        remove_unused_columns=False,
        report_to="none",
        
        # ✅ DeepSpeed (선택사항)
        # deepspeed="ds_config.json",  # DeepSpeed 사용시
    )
    
    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
    
    # ✅ Trainer (자동 Multi-GPU)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    # Train!
    print(f"\n🚀 Starting Multi-GPU Fine-tuning...")
    print(f"   Using {NUM_GPUS} GPUs")
    
    trainer.train()
    
    elapsed = time.time() - start_time
    
    print(f"\n✅ Fine-tuning Complete!")
    print(f"   Time: {elapsed/60:.1f} minutes")
    print(f"   Effective samples/sec: {(num_steps * batch_size * NUM_GPUS) / elapsed:.2f}")
    print(f"="*80 + "\n")
    
    return model


def prepare_simple_dataset(
    tokenizer,
    num_steps: int,
    batch_size: int,
    max_length: int = 2048,
):
    """Dataset 준비"""
    print(f"\n📊 Preparing Dataset...")
    
    num_samples = num_steps * batch_size
    
    print(f"   Target samples: {num_samples}")
    
    try:
        dataset = load_dataset(
            "wikitext",
            "wikitext-2-raw-v1",
            split=f"train[:{num_samples}]"
        )
        print(f"   ✅ Loaded: {len(dataset)} samples")
    except Exception as e:
        print(f"   ❌ Failed: {e}")
        raise
    
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )
    
    tokenized = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names,
        num_proc=4  # Parallel processing
    )
    
    print(f"   ✅ Tokenized: {len(tokenized)} samples")
    
    return tokenized


def estimate_finetuning_cost(
    model_size: str,
    num_steps: int,
    batch_size: int,
    num_gpus: int = NUM_GPUS,
    gpu_type: str = "H100",
) -> Dict:
    """비용 계산기 - Multi-GPU"""
    gpu_costs = {
        "H100": 3.0,
        "A100": 2.0,
        "A10G": 1.0,
    }
    
    model_step_times = {
        "0.6B": 0.5,
        "1.5B": 1.0,
        "3B": 2.0,
        "7B": 3.5,
        "14B": 6.0,
    }
    
    # Multi-GPU로 인한 시간 단축 (linear scaling 가정)
    step_time = model_step_times.get(model_size, 1.0) * (batch_size / 4)
    step_time_per_gpu = step_time / num_gpus  # GPU 병렬화
    
    total_seconds = num_steps * step_time_per_gpu
    total_hours = total_seconds / 3600
    
    # 비용은 GPU 수만큼 곱함
    total_cost_usd = total_hours * gpu_costs.get(gpu_type, 2.0) * num_gpus
    
    return {
        'hours': round(total_hours, 2),
        'cost_usd': round(total_cost_usd, 2),
        'cost_krw': round(total_cost_usd * 1300, 0),
        'num_gpus': num_gpus,
        'gpu_type': gpu_type,
    }


# =====================================================
# Custom Modeling Code (동일)
# =====================================================

def generate_modeling_phoenix_code():
    """PHOENIX Custom Modeling Code v2.0"""
    
    return '''"""
PHOENIX Retention Model v2.0
✅ v2.0: Brumby-style Retraining support
✅ v1.4.3: forward() 시그니처 Transformers 호환
✅ v1.4.3: dtype 불일치 수정
"""

import torch
import torch.nn as nn
from typing import Optional, Tuple
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers import AutoConfig, AutoModelForCausalLM
import os


class PhoenixConfig(PretrainedConfig):
    model_type = "phoenix"
    def __init__(self, use_phoenix_retention=True, phoenix_version="2.0", 
                 original_model=None, use_hierarchical=True, **kwargs):
        super().__init__(**kwargs)
        self.use_phoenix_retention = use_phoenix_retention
        self.phoenix_version = phoenix_version
        self.original_model = original_model
        self.use_hierarchical = use_hierarchical


class MultiScaleRetention(nn.Module):
    def __init__(self, config, layer_idx=0):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = getattr(config, 'head_dim', self.hidden_size // self.num_heads)
        self.num_key_value_heads = getattr(config, 'num_key_value_heads', self.num_heads)
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.q_dim = self.num_heads * self.head_dim
        self.kv_dim = self.num_key_value_heads * self.head_dim
        
        self.q_proj = nn.Linear(self.hidden_size, self.q_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False)
        self.o_proj = nn.Linear(self.q_dim, self.hidden_size, bias=False)
        self.decay = nn.Parameter(torch.linspace(0.95, 0.99, self.num_heads))
        self.group_norm = nn.GroupNorm(self.num_heads, self.q_dim)
    
    def _repeat_kv(self, x, n):
        b, h, s, d = x.shape
        if n == 1: return x
        return x[:, :, None, :, :].expand(b, h, n, s, d).reshape(b, h*n, s, d)
    
    def forward(self, hidden_states, **kwargs):
        b, s, _ = hidden_states.shape
        device, dtype = hidden_states.device, hidden_states.dtype
        
        if self.q_proj.weight.device != device or self.q_proj.weight.dtype != dtype:
            self.to(device=device, dtype=dtype)
        
        q = self.q_proj(hidden_states).view(b, s, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(hidden_states).view(b, s, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(hidden_states).view(b, s, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        
        k = self._repeat_kv(k, self.num_key_value_groups)
        v = self._repeat_kv(v, self.num_key_value_groups)
        
        out = self._retention(q, k, v)
        out = out.transpose(1, 2).reshape(b, s, self.q_dim)
        out = self.group_norm(out.transpose(1, 2)).transpose(1, 2)
        return (self.o_proj(torch.clamp(out, -10, 10)), None)
    
    def _retention(self, q, k, v):
        b, h, s, d = q.shape
        state = torch.zeros(b, h, d, d, dtype=q.dtype, device=q.device) + 1e-6
        decay = torch.sigmoid(self.decay).view(1, -1, 1, 1).to(q)
        outs = []
        for t in range(s):
            state = decay * state + torch.clamp(torch.einsum('bhd,bhe->bhde', k[:,:,t], v[:,:,t]), -5, 5)
            state = torch.clamp(state, -10, 10)
            outs.append(torch.einsum('bhd,bhde->bhe', q[:,:,t], state))
        return torch.stack(outs, dim=2)


class HierarchicalRetention(nn.Module):
    def __init__(self, config, layer_idx=0):
        super().__init__()
        self.base_retention = MultiScaleRetention(config, layer_idx)
        h = config.hidden_size
        self.d_state = h // 2
        self.short_proj = nn.Linear(h, self.d_state)
        self.medium_proj = nn.Linear(self.d_state, self.d_state)
        self.long_proj = nn.Linear(self.d_state, self.d_state*2)
        self.fusion = nn.Linear(self.d_state*4, h)
        self.norm = nn.LayerNorm(h)
        self.decays = [0.5, 0.8, 0.95]
    
    def forward(self, hidden_states, **kwargs):
        b, s, h = hidden_states.shape
        device, dtype = hidden_states.device, hidden_states.dtype
        
        if self.short_proj.weight.device != device or self.short_proj.weight.dtype != dtype:
            self.to(device=device, dtype=dtype)
        
        ret_out = self.base_retention(hidden_states)[0]
        short = torch.zeros(b, self.d_state, dtype=dtype, device=device)
        med = torch.zeros(b, self.d_state, dtype=dtype, device=device)
        long = torch.zeros(b, self.d_state*2, dtype=dtype, device=device)
        outs = []
        
        for t in range(s):
            short = self.decays[0]*short + self.short_proj(ret_out[:,t])
            if t % 8 == 0: med = self.decays[1]*med + self.medium_proj(short)
            if t % 64 == 0: long = self.decays[2]*long + self.long_proj(med)
            outs.append(self.fusion(torch.cat([short, med, long], -1)))
        
        return (self.norm(torch.stack(outs, 1)), None)


def replace_attention_with_retention_for_loading(model, use_hierarchical=True):
    layers = getattr(model, 'model', model)
    layers = getattr(layers, 'layers', getattr(layers, 'h', None))
    if layers is None: return model, 0, 0
    
    original_dtype = None
    for param in model.parameters():
        original_dtype = param.dtype
        break
    
    cnt = 0
    for i, layer in enumerate(layers):
        if hasattr(layer, 'self_attn'):
            new_ret = HierarchicalRetention(model.config, i) if use_hierarchical else MultiScaleRetention(model.config, i)
            if original_dtype: new_ret = new_ret.to(dtype=original_dtype)
            layer.self_attn = new_ret
            cnt += 1
    return model, cnt, len(layers)


class PhoenixPreTrainedModel(PreTrainedModel):
    config_class = PhoenixConfig
    base_model_prefix = "phoenix"


class PhoenixModelForCausalLM(PhoenixPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self._model = None
        self._ready = False
    
    @classmethod
    def from_pretrained(cls, path, *args, **kwargs):
        print(f"🔥 PHOENIX v2.0 loading from {path}")
        config = AutoConfig.from_pretrained(path, trust_remote_code=True)
        orig = getattr(config, 'original_model', 'Qwen/Qwen3-0.6B')
        hier = getattr(config, 'use_hierarchical', True)
        
        try:
            base_cfg = AutoConfig.from_pretrained(orig, trust_remote_code=True)
        except:
            base_cfg = config
        
        model = AutoModelForCausalLM.from_config(base_cfg)
        model, conv, tot = replace_attention_with_retention_for_loading(model, hier)
        print(f"   ✅ Converted {conv}/{tot} layers")
        
        sd = None
        if os.path.exists(path):
            for fname in ["model.safetensors", "pytorch_model.bin"]:
                fpath = os.path.join(path, fname)
                if os.path.exists(fpath):
                    if fname.endswith('.safetensors'):
                        from safetensors.torch import load_file
                        sd = load_file(fpath)
                    else:
                        sd = torch.load(fpath, map_location='cpu')
                    break
        else:
            from huggingface_hub import hf_hub_download
            for fname in ["model.safetensors", "pytorch_model.bin"]:
                try:
                    fpath = hf_hub_download(path, fname)
                    if fname.endswith('.safetensors'):
                        from safetensors.torch import load_file
                        sd = load_file(fpath)
                    else:
                        sd = torch.load(fpath, map_location='cpu')
                    break
                except: pass
        
        if sd:
            miss, unex = model.load_state_dict(sd, strict=False)
            print(f"   📦 Weights: {len(miss)} missing, {len(unex)} unexpected")
            
            if 'lm_head.weight' in miss and getattr(config, 'tie_word_embeddings', False):
                if hasattr(model, 'lm_head') and hasattr(model.model, 'embed_tokens'):
                    model.lm_head.weight = model.model.embed_tokens.weight
                    print(f"   🔗 Tied embeddings")
        
        inst = cls(config)
        inst._model = model
        inst._ready = True
        print(f"✅ PHOENIX v2.0 ready!")
        return inst
    
    def forward(self, *a, **k):
        if not self._ready: raise ValueError("Not initialized")
        return self._model(*a, **k)
    
    def generate(self, *a, **k):
        if not self._ready: raise ValueError("Not initialized")
        return self._model.generate(*a, **k)


AutoConfig.register("phoenix", PhoenixConfig)
'''


# =====================================================
# 저장/업로드/평가 (동일)
# =====================================================

def save_phoenix_model_with_code(model, tokenizer, output_path, original_model_url, metadata):
    """PHOENIX 모델 저장"""
    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)
    
    print(f"\n💾 Saving PHOENIX model...")
    
    if hasattr(model.config, 'tie_word_embeddings') and model.config.tie_word_embeddings:
        if hasattr(model, 'lm_head') and hasattr(model, 'model') and hasattr(model.model, 'embed_tokens'):
            model.lm_head.weight = model.model.embed_tokens.weight
    
    model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)
    
    modeling_code = generate_modeling_phoenix_code()
    with open(output_path / "modeling_phoenix.py", "w") as f:
        f.write(modeling_code)
    
    config_path = output_path / "config.json"
    if config_path.exists():
        with open(config_path, "r") as f:
            config_dict = json.load(f)
        
        config_dict["use_phoenix_retention"] = True
        config_dict["phoenix_version"] = "2.0"
        config_dict["original_model"] = original_model_url
        config_dict["auto_map"] = {
            "AutoModelForCausalLM": "modeling_phoenix.PhoenixModelForCausalLM",
        }
        
        with open(config_path, "w") as f:
            json.dump(config_dict, f, indent=2)
    
    with open(output_path / 'phoenix_metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)
    
    readme = f"""# 🔥 PHOENIX v2.0 - {original_model_url}

**Multi-GPU Trained** with {metadata.get('num_gpus', 1)} GPUs

## Features
- ✅ Brumby-style Retraining
- ✅ O(n) Complexity
- ✅ GQA Support

## Usage
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "{output_path.name}",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto"
)
```

**VIDraft AI Research Lab** | PHOENIX v2.0 Multi-GPU
"""
    
    with open(output_path / "README.md", "w") as f:
        f.write(readme)
    
    print(f"   ✅ Model saved")


def upload_to_huggingface_hub(
    model_path: str,
    original_model_url: str,
    repo_name: str = None,
    private: bool = True,
    token: str = None,
) -> Tuple[bool, str, str]:
    """Upload to Hub"""
    
    if token is None:
        token = HF_TOKEN
    
    if not token:
        return False, "", "❌ No HF_TOKEN"
    
    try:
        api = HfApi(token=token)
        user_info = api.whoami(token=token)
        username = user_info['name']
        
        if not repo_name:
            base_name = original_model_url.split('/')[-1]
            repo_name = f"phoenix-{base_name}"
        
        repo_id = f"{username}/{repo_name}"
        
        create_repo(
            repo_id=repo_id,
            token=token,
            private=private,
            repo_type="model",
            exist_ok=True
        )
        
        api.upload_folder(
            folder_path=str(model_path),
            repo_id=repo_id,
            repo_type="model",
            token=token,
        )
        
        hub_url = f"https://huggingface.co/{repo_id}"
        
        return True, hub_url, f"✅ Uploaded to {hub_url}"
        
    except Exception as e:
        return False, "", f"❌ Upload failed: {e}"


def evaluate_model_quality(model, tokenizer):
    """Quality 평가"""
    test_prompts = [
        "The capital of France is",
        "In machine learning,",
        "2 + 2 =",
    ]
    
    model.eval()
    scores = []
    
    with torch.no_grad():
        for prompt in test_prompts:
            try:
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=20,
                    do_sample=False,
                    pad_token_id=tokenizer.eos_token_id,
                )
                generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
                
                score = 0.0
                if len(generated) > len(prompt):
                    score += 0.3
                if not any(c in generated[len(prompt):] for c in ['�', '[UNK]']):
                    score += 0.3
                if len(generated.split()) > len(prompt.split()) + 2:
                    score += 0.4
                
                scores.append(score)
            except:
                scores.append(0.0)
    
    return sum(scores) / len(scores) if scores else 0.0


# =====================================================
# 🆕 Multi-GPU Burning 함수
# =====================================================

def burn_model_with_finetuning(
    model_url: str,
    output_dir: str,
    use_hierarchical: bool = True,
    enable_finetuning: bool = False,
    num_steps: int = 3000,
    batch_size: int = 4,
    learning_rate: float = 1e-5,
    use_gradient_checkpointing: bool = True,
):
    """🆕 v2.0: Multi-GPU Optimized Burning"""
    print("="*80)
    print(f"🔥 PHOENIX Model Burning v2.0 - Multi-GPU ({NUM_GPUS} GPUs)")
    print("="*80)
    
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    
    try:
        # STEP 1: Structure Analysis
        print(f"\n🔍 STEP 1: Structure Analysis...")
        structure_info = analyze_model_structure(model_url)
        
        # STEP 2: Load Model with device_map="auto"
        print(f"\n📥 STEP 2: Loading model (Multi-GPU)...")
        start_time = time.time()
        
        config = AutoConfig.from_pretrained(model_url, trust_remote_code=True)
        
        # ✅ Multi-GPU: device_map="auto"로 자동 분산
        model = AutoModelForCausalLM.from_pretrained(
            model_url,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto"  # 자동으로 8개 GPU에 분산!
        )
        
        tokenizer = AutoTokenizer.from_pretrained(model_url, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        
        load_time = time.time() - start_time
        print(f"✅ Loaded across {NUM_GPUS} GPUs in {load_time:.1f}s")
        
        # STEP 3: Convert
        print(f"\n🔄 STEP 3: Converting Attention → Retention...")
        convert_start = time.time()
        
        model, converted, total = replace_attention_with_retention(
            model,
            use_hierarchical=use_hierarchical,
            structure_info=structure_info
        )
        
        convert_time = time.time() - convert_start
        conversion_rate = converted / total if total > 0 else 0
        
        print(f"✅ Converted {converted}/{total} layers in {convert_time:.1f}s")
        
        # STEP 4: Fine-tuning (Multi-GPU)
        if enable_finetuning:
            print(f"\n🚀 STEP 4: Multi-GPU Fine-tuning...")
            ft_start = time.time()
            
            model = finetune_retention_model(
                model=model,
                tokenizer=tokenizer,
                num_steps=num_steps,
                batch_size=batch_size,
                learning_rate=learning_rate,
                use_gradient_checkpointing=use_gradient_checkpointing,
            )
            
            ft_time = time.time() - ft_start
            print(f"✅ Fine-tuning completed in {ft_time/60:.1f} minutes")
        else:
            ft_time = 0
            print(f"\n⏭️ STEP 4: Fine-tuning skipped")
        
        # STEP 5: Evaluate
        print(f"\n📊 STEP 5: Evaluating...")
        quality_score = evaluate_model_quality(model, tokenizer)
        print(f"✅ Quality: {quality_score:.2f}/1.00")
        
        # STEP 6: Save
        print(f"\n💾 STEP 6: Saving...")
        
        metadata = {
            'phoenix_version': '2.0',
            'original_model': model_url,
            'use_hierarchical': use_hierarchical,
            'conversion_rate': conversion_rate,
            'quality_score': quality_score,
            'finetuned': enable_finetuning,
            'finetuning_steps': num_steps if enable_finetuning else 0,
            'num_gpus': NUM_GPUS,
            'gradient_checkpointing': use_gradient_checkpointing,
            'timestamp': datetime.now().isoformat(),
        }
        
        save_phoenix_model_with_code(model, tokenizer, output_path, model_url, metadata)
        
        total_time = time.time() - start_time
        
        result = {
            'status': 'success',
            'model_path': str(output_path),
            'conversion_rate': conversion_rate,
            'quality_score': quality_score,
            'total_time': total_time,
            'finetuned': enable_finetuning,
            'num_gpus': NUM_GPUS,
            'structure_info': structure_info,
        }
        
        print(f"\n{'='*80}")
        print(f"✅ Multi-GPU Burning Complete!")
        print(f"   GPUs Used: {NUM_GPUS}")
        print(f"   Model: {output_path}")
        print(f"   Quality: {quality_score:.2f}/1.00")
        print(f"{'='*80}\n")
        
        return result
        
    except Exception as e:
        import traceback
        return {
            'status': 'failed',
            'error': str(e),
            'traceback': traceback.format_exc()
        }


# =====================================================
# Database (동일)
# =====================================================

class ExperimentDatabase:
    def __init__(self, db_path: str):
        self.db_path = db_path
        self.init_database()
    
    def init_database(self):
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS burning_history (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    model_url TEXT,
                    output_path TEXT,
                    hub_url TEXT,
                    conversion_rate REAL,
                    quality_score REAL,
                    finetuned BOOLEAN,
                    num_gpus INTEGER,
                    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.commit()
    
    def save_burning(self, info: Dict) -> int:
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()
            cursor.execute("""
                INSERT INTO burning_history 
                (model_url, output_path, hub_url, conversion_rate, quality_score, finetuned, num_gpus)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, (
                info.get('model_url'),
                info.get('output_path'),
                info.get('hub_url'),
                info.get('conversion_rate'),
                info.get('quality_score'),
                info.get('finetuned'),
                info.get('num_gpus', 1),
            ))
            conn.commit()
            return cursor.lastrowid
    
    def get_history(self, limit: int = 20) -> List[Dict]:
        with sqlite3.connect(self.db_path) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            cursor.execute("SELECT * FROM burning_history ORDER BY timestamp DESC LIMIT ?", (limit,))
            return [dict(row) for row in cursor.fetchall()]


db = ExperimentDatabase(DB_PATH)


# =====================================================
# Gradio UI
# =====================================================

def burn_phoenix_model_ui(
    model_url,
    use_hierarchical,
    output_name,
    enable_finetuning,
    ft_steps,
    ft_batch,
    ft_lr,
    use_grad_ckpt,
    upload_hub,
    hub_repo,
    hub_private,
):
    """Gradio UI"""
    
    try:
        if not model_url.strip():
            return "⚠️ Model URL required", None
        
        if not output_name.strip():
            output_name = f"phoenix_{model_url.split('/')[-1]}_{int(time.time())}"
        
        output_dir = f"{MODELS_PATH}/{output_name}"
        
        # 비용 추정
        if enable_finetuning:
            model_size = "0.6B" if "0.6B" in model_url else "1.5B"
            cost = estimate_finetuning_cost(model_size, ft_steps, ft_batch, NUM_GPUS)
            print(f"\n💰 Estimated Cost: ${cost['cost_usd']} ({cost['hours']}h with {NUM_GPUS} GPUs)")
        
        # Burn
        result = burn_model_with_finetuning(
            model_url=model_url,
            output_dir=output_dir,
            use_hierarchical=use_hierarchical,
            enable_finetuning=enable_finetuning,
            num_steps=ft_steps,
            batch_size=ft_batch,
            learning_rate=ft_lr,
            use_gradient_checkpointing=use_grad_ckpt,
        )
        
        if result['status'] != 'success':
            return f"❌ Failed\n```\n{result.get('error')}\n```", None
        
        # Upload
        hub_url = None
        if upload_hub and HF_TOKEN:
            success, hub_url, msg = upload_to_huggingface_hub(
                model_path=result['model_path'],
                original_model_url=model_url,
                repo_name=hub_repo if hub_repo.strip() else None,
                private=hub_private,
            )
        
        # DB
        db.save_burning({
            'model_url': model_url,
            'output_path': result['model_path'],
            'hub_url': hub_url,
            'conversion_rate': result['conversion_rate'],
            'quality_score': result['quality_score'],
            'finetuned': enable_finetuning,
            'num_gpus': NUM_GPUS,
        })
        
        # Output
        output_md = f"""
# 🔥 PHOENIX v2.0 Multi-GPU Complete!

## Hardware
- **GPUs Used**: {NUM_GPUS} x {torch.cuda.get_device_name(0) if NUM_GPUS > 0 else 'N/A'}

## Model Info
- **Original**: {model_url}
- **Output**: `{result['model_path']}`
- **Conversion**: {result['conversion_rate']*100:.1f}%
- **Quality**: {result['quality_score']:.2f}/1.00
- **Fine-tuned**: {'✅ YES' if enable_finetuning else '❌ NO'}
"""
        
        if hub_url:
            output_md += f"""

## Hub Status
✅ **Uploaded**: [{hub_url}]({hub_url})

```python
model = AutoModelForCausalLM.from_pretrained(
    "{hub_url.replace('https://huggingface.co/', '')}",
    trust_remote_code=True,
    device_map="auto"  # Multi-GPU
)
```
"""
        
        # Plot
        fig = go.Figure()
        fig.add_trace(go.Bar(
            x=['Conversion', 'Quality'],
            y=[result['conversion_rate'], result['quality_score']],
            marker_color=['#3b82f6', '#10b981']
        ))
        fig.update_layout(title=f"Metrics ({NUM_GPUS} GPUs)", yaxis_range=[0, 1])
        
        return output_md, fig
        
    except Exception as e:
        import traceback
        return f"❌ Error:\n```\n{traceback.format_exc()}\n```", None


def view_history():
    """History"""
    try:
        history = db.get_history(20)
        if not history:
            return "📭 No history", None
        
        df = pd.DataFrame(history)
        
        fig = px.scatter(
            df,
            x='timestamp',
            y='quality_score',
            color='finetuned',
            size='num_gpus',
            title='Burning History (Multi-GPU)'
        )
        
        return f"## History\n\n{df.to_markdown(index=False)}", fig
    except Exception as e:
        return f"❌ Error: {e}", None


# =====================================================
# Gradio App
# =====================================================

with gr.Blocks(title="🔥 PHOENIX v2.0 Multi-GPU", theme=gr.themes.Soft()) as demo:
    
    gr.Markdown(f"""
    # 🔥 PHOENIX v2.0 - Multi-GPU Optimized
    
    **H100 x {NUM_GPUS} GPUs Ready**
    
    🆕 **v2.0 Multi-GPU**: Accelerate 통합, DDP 지원  
    🆕 **v2.0**: Fine-tuning 파이프라인 (Brumby-style)  
    ✅ v1.4.3: All fixes included  
    ✅ GQA Support | O(n) Complexity
    
    ---
    """)
    
    with gr.Tabs():
        with gr.Tab("🔥 Model Burning"):
            with gr.Row():
                with gr.Column(scale=1):
                    burn_url = gr.Textbox(
                        label="🔗 Model URL",
                        value=DEFAULT_MODEL,
                        placeholder="Qwen/Qwen3-0.6B"
                    )
                    burn_hier = gr.Checkbox(value=True, label="Hierarchical Retention")
                    burn_name = gr.Textbox(label="💾 Output Name", placeholder="my_model")
                    
                    gr.Markdown("---")
                    gr.Markdown(f"### 🆕 Fine-tuning ({NUM_GPUS} GPUs)")
                    
                    burn_ft_enable = gr.Checkbox(
                        value=False, 
                        label="🚀 Enable Fine-tuning (Brumby-style)",
                        info=f"Multi-GPU acceleration with {NUM_GPUS} GPUs!"
                    )
                    
                    burn_ft_steps = gr.Slider(
                        1000, 10000, 3000,
                        step=100,
                        label="Steps",
                        visible=False
                    )
                    
                    burn_ft_batch = gr.Slider(
                        1, 16, 4, 
                        step=1, 
                        label=f"Batch Size per GPU ({NUM_GPUS} GPUs)", 
                        visible=False
                    )
                    burn_ft_lr = gr.Number(value=1e-5, label="Learning Rate", visible=False)
                    
                    burn_grad_ckpt = gr.Checkbox(
                        value=True, 
                        label="✅ Gradient Checkpointing (saves memory)",
                        visible=False
                    )
                    
                    def toggle_ft(enabled):
                        return [
                            gr.update(visible=enabled),
                            gr.update(visible=enabled),
                            gr.update(visible=enabled),
                            gr.update(visible=enabled),
                        ]
                    
                    burn_ft_enable.change(
                        toggle_ft,
                        [burn_ft_enable],
                        [burn_ft_steps, burn_ft_batch, burn_ft_lr, burn_grad_ckpt]
                    )
                    
                    gr.Markdown("---")
                    gr.Markdown("### 🌐 Hub Upload")
                    
                    burn_upload = gr.Checkbox(value=True, label="📤 Upload to Hub")
                    burn_repo = gr.Textbox(label="📦 Repo Name (optional)")
                    burn_private = gr.Checkbox(value=True, label="🔒 Private")
                    
                    burn_btn = gr.Button("🔥 Burn Model", variant="primary", size="lg")
                
                with gr.Column(scale=2):
                    burn_output = gr.Markdown()
                    burn_plot = gr.Plot()
            
            burn_btn.click(
                burn_phoenix_model_ui,
                [
                    burn_url, burn_hier, burn_name,
                    burn_ft_enable, burn_ft_steps, burn_ft_batch, burn_ft_lr, burn_grad_ckpt,
                    burn_upload, burn_repo, burn_private
                ],
                [burn_output, burn_plot]
            )
        
        with gr.Tab("📊 History"):
            with gr.Row():
                with gr.Column(scale=1):
                    hist_btn = gr.Button("📊 Load", variant="primary")
                with gr.Column(scale=2):
                    hist_out = gr.Markdown()
                    hist_plot = gr.Plot()
            
            hist_btn.click(view_history, outputs=[hist_out, hist_plot])
    
    gr.Markdown(f"""
    ---
    
    ## 🔥 PHOENIX v2.0 Multi-GPU
    
    **Hardware**: {NUM_GPUS} x {torch.cuda.get_device_name(0) if NUM_GPUS > 0 else 'N/A'}
    
    **Features**:
    - 🆕 Multi-GPU Training (DDP)
    - 🆕 Gradient Checkpointing
    - 🆕 H100 Optimized (fused optimizer)
    - 🆕 Brumby-style Fine-tuning
    - ✅ All v1.4.3 Fixes
    
    **Token**: {'✅' if HF_TOKEN else '❌ Not Found'}  
    **VIDraft AI Research Lab** | PHOENIX v2.0 Multi-GPU
    """)


if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description='PHOENIX v2.0 Multi-GPU')
    parser.add_argument('--port', type=int, default=None, help='Server port (default: auto find 7860-7960)')
    parser.add_argument('--share', action='store_true', help='Create public Gradio link')
    parser.add_argument('--host', type=str, default="0.0.0.0", help='Server host')
    args = parser.parse_args()
    
    demo.queue(max_size=20)
    
    # 포트 자동 찾기
    if args.port is None:
        # 7860부터 7960까지 시도
        for port in range(7860, 7960):
            try:
                demo.launch(
                    server_name=args.host,
                    server_port=port,
                    share=args.share,
                    show_error=True
                )
                break
            except OSError:
                continue
    else:
        demo.launch(
            server_name=args.host,
            server_port=args.port,
            share=args.share,
            show_error=True
        )