""" πŸ”₯ PHOENIX Retention Research Platform v2.0 - MULTI-GPU OPTIMIZED H100 x 8 GPU μ΅œμ ν™” 버전 βœ… v2.0 NEW: Multi-GPU (8x H100) μ΅œμ ν™” βœ… v2.0 NEW: Accelerate 톡합 βœ… v2.0 NEW: DeepSpeed ZeRO-3 지원 βœ… v2.0 NEW: Gradient Checkpointing βœ… Fine-tuning νŒŒμ΄ν”„λΌμΈ (Brumby-style) βœ… λͺ¨λ“  v1.4.3 μˆ˜μ •μ‚¬ν•­ 포함 VIDraft AI Research Lab - Multi-GPU Version v2.0 """ import gradio as gr import torch import torch.nn as nn import torch.nn.functional as F import sqlite3 import json import time import numpy as np from datetime import datetime from pathlib import Path import plotly.graph_objects as go import plotly.express as px import pandas as pd from typing import Dict, List, Any, Tuple, Optional from transformers import ( AutoModel, AutoTokenizer, AutoConfig, AutoModelForCausalLM, get_cosine_schedule_with_warmup, TrainingArguments, Trainer, DataCollatorForLanguageModeling ) from datasets import load_dataset, concatenate_datasets from torch.utils.data import Dataset, DataLoader from accelerate import Accelerator from tqdm import tqdm import copy import shutil import os from huggingface_hub import HfApi, create_repo # ===================================================== # μ „μ—­ μ„€μ • - MULTI-GPU # ===================================================== # GPU μ„€μ • DEVICE = "cuda" if torch.cuda.is_available() else "cpu" NUM_GPUS = torch.cuda.device_count() # βœ… μš°λΆ„νˆ¬ ν˜Έν™˜: ν™ˆ 디렉토리 λ˜λŠ” ν™˜κ²½ λ³€μˆ˜ μ‚¬μš© STORAGE_PATH = os.getenv("PHOENIX_STORAGE_PATH", str(Path.home() / "phoenix_data")) DB_PATH = f"{STORAGE_PATH}/phoenix_experiments.db" MODELS_PATH = f"{STORAGE_PATH}/phoenix_models" DEFAULT_MODEL = "Qwen/Qwen3-0.6B" # HuggingFace Token HF_TOKEN = os.getenv("HF_TOKEN") # 디렉토리 생성 (κΆŒν•œ 였λ₯˜ 처리) try: Path(STORAGE_PATH).mkdir(parents=True, exist_ok=True) Path(MODELS_PATH).mkdir(parents=True, exist_ok=True) print(f"βœ… Storage initialized: {STORAGE_PATH}") except PermissionError: print(f"⚠️ Permission denied for {STORAGE_PATH}") print(f" Using current directory instead") STORAGE_PATH = "./phoenix_data" DB_PATH = f"{STORAGE_PATH}/phoenix_experiments.db" MODELS_PATH = f"{STORAGE_PATH}/phoenix_models" Path(STORAGE_PATH).mkdir(parents=True, exist_ok=True) Path(MODELS_PATH).mkdir(parents=True, exist_ok=True) print(f"πŸ”₯ PHOENIX Platform v2.0 - Multi-GPU Optimized") print(f"πŸ’Ύ Storage: {STORAGE_PATH}") print(f"🎯 Default Base Model: {DEFAULT_MODEL}") print(f"πŸš€ GPUs Available: {NUM_GPUS}") if NUM_GPUS > 0: for i in range(NUM_GPUS): print(f" GPU {i}: {torch.cuda.get_device_name(i)}") if HF_TOKEN: print(f"πŸ”‘ HuggingFace Token: {'*' * 10}{HF_TOKEN[-4:]}") # ===================================================== # λͺ¨λΈ ꡬ쑰 뢄석 ν•¨μˆ˜ # ===================================================== def analyze_model_structure(model_url: str) -> Dict[str, Any]: """πŸ” λͺ¨λΈ ꡬ쑰 사전 뢄석""" print("\n" + "="*80) print("πŸ” MODEL STRUCTURE ANALYSIS") print("="*80) try: print(f"\nπŸ“₯ Loading model config: {model_url}") config = AutoConfig.from_pretrained(model_url, trust_remote_code=True) print(f"βœ… Config loaded") # βœ… Multi-GPU: CPU둜만 λ‘œλ“œ (λΆ„μ„μš©) print(f"\nπŸ“¦ Loading model structure (CPU only)...") model = AutoModelForCausalLM.from_pretrained( model_url, trust_remote_code=True, torch_dtype=torch.float16, device_map="cpu" # Analysis만 CPUμ—μ„œ ) analysis = { 'model_url': model_url, 'model_type': config.model_type if hasattr(config, 'model_type') else 'unknown', 'architectures': config.architectures[0] if hasattr(config, 'architectures') else 'unknown', 'hidden_size': config.hidden_size if hasattr(config, 'hidden_size') else 0, 'num_attention_heads': config.num_attention_heads if hasattr(config, 'num_attention_heads') else 0, 'num_hidden_layers': config.num_hidden_layers if hasattr(config, 'num_hidden_layers') else 0, 'num_key_value_heads': config.num_key_value_heads if hasattr(config, 'num_key_value_heads') else None, 'total_layers': 0, 'has_self_attn': False, 'layer_path': None, } # Layer 뢄석 layers = None layer_path = None possible_paths = [ ('model.layers', lambda m: m.model.layers if hasattr(m, 'model') and hasattr(m.model, 'layers') else None), ('transformer.h', lambda m: m.transformer.h if hasattr(m, 'transformer') and hasattr(m.transformer, 'h') else None), ] for path_name, path_fn in possible_paths: result = path_fn(model) if result is not None: layers = result layer_path = path_name break if layers: analysis['total_layers'] = len(layers) analysis['layer_path'] = layer_path if len(layers) > 0: first_layer = layers[0] if hasattr(first_layer, 'self_attn'): analysis['has_self_attn'] = True attn = first_layer.self_attn if hasattr(attn, 'q_proj'): q_shape = attn.q_proj.weight.shape k_shape = attn.k_proj.weight.shape if hasattr(config, 'num_attention_heads') and config.num_attention_heads > 0: head_dim = q_shape[0] // config.num_attention_heads analysis['head_dim'] = head_dim analysis['gqa_detected'] = (k_shape[0] != q_shape[0]) analysis['q_dim'] = q_shape[0] analysis['k_dim'] = k_shape[0] print(f"\n{'='*80}\n") del model torch.cuda.empty_cache() return analysis except Exception as e: import traceback print(f"\n❌ Structure analysis failed: {e}") return { 'model_url': model_url, 'error': str(e), 'total_layers': 0, } # ===================================================== # PHOENIX Retention (동일) # ===================================================== class MultiScaleRetention(nn.Module): """μ§„μ§œ Retention Attention with GQA Support""" def __init__(self, config, layer_idx=0): super().__init__() self.config = config self.layer_idx = layer_idx self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads if hasattr(config, 'head_dim'): self.head_dim = config.head_dim else: self.head_dim = self.hidden_size // self.num_heads if hasattr(config, 'num_key_value_heads'): self.num_key_value_heads = config.num_key_value_heads else: self.num_key_value_heads = self.num_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.kv_head_dim = self.head_dim self.q_dim = self.num_heads * self.head_dim self.kv_dim = self.num_key_value_heads * self.kv_head_dim self.q_proj = nn.Linear(self.hidden_size, self.q_dim, bias=False) self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) self.o_proj = nn.Linear(self.q_dim, self.hidden_size, bias=False) decay_values = torch.linspace(0.95, 0.99, self.num_heads) self.decay = nn.Parameter(decay_values, requires_grad=True) self.group_norm = nn.GroupNorm( num_groups=self.num_heads, num_channels=self.q_dim ) def _repeat_kv(self, hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """Repeat K/V heads (GQA)""" batch, num_key_value_heads, slen, head_dim = hidden_states.shape if n_rep == 1: return hidden_states hidden_states = hidden_states[:, :, None, :, :].expand( batch, num_key_value_heads, n_rep, slen, head_dim ) return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.Tensor] = None, past_key_values: Optional[Tuple[torch.Tensor]] = None, **kwargs ): """O(n) Retention""" batch_size, seq_len, _ = hidden_states.shape target_device = hidden_states.device target_dtype = hidden_states.dtype if self.q_proj.weight.device != target_device or self.q_proj.weight.dtype != target_dtype: self.to(device=target_device, dtype=target_dtype) query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) query_states = query_states.view( batch_size, seq_len, self.num_heads, self.head_dim ).transpose(1, 2) key_states = key_states.view( batch_size, seq_len, self.num_key_value_heads, self.kv_head_dim ).transpose(1, 2) value_states = value_states.view( batch_size, seq_len, self.num_key_value_heads, self.kv_head_dim ).transpose(1, 2) key_states = self._repeat_kv(key_states, self.num_key_value_groups) value_states = self._repeat_kv(value_states, self.num_key_value_groups) retention_states = self._compute_retention( query_states, key_states, value_states ) retention_states = retention_states.transpose(1, 2).contiguous() retention_states = retention_states.reshape( batch_size, seq_len, self.q_dim ) if self.group_norm.weight.device != retention_states.device or self.group_norm.weight.dtype != retention_states.dtype: self.group_norm = self.group_norm.to(device=retention_states.device, dtype=retention_states.dtype) retention_states = self.group_norm( retention_states.transpose(1, 2) ).transpose(1, 2) retention_states = torch.clamp(retention_states, min=-10.0, max=10.0) attn_output = self.o_proj(retention_states) return (attn_output, None) def _compute_retention( self, queries: torch.Tensor, keys: torch.Tensor, values: torch.Tensor, ): """O(n) Retention computation""" batch_size, num_heads, seq_len, head_dim = queries.shape state = torch.zeros( batch_size, num_heads, head_dim, head_dim, dtype=queries.dtype, device=queries.device ) + 1e-6 outputs = [] decay = torch.sigmoid(self.decay).view(1, -1, 1, 1).to( device=queries.device, dtype=queries.dtype ) for t in range(seq_len): q_t = queries[:, :, t, :] k_t = keys[:, :, t, :] v_t = values[:, :, t, :] state = decay * state kv_update = torch.einsum('bhd,bhe->bhde', k_t, v_t) kv_update = torch.clamp(kv_update, min=-5.0, max=5.0) state = state + kv_update state = torch.clamp(state, min=-10.0, max=10.0) output_t = torch.einsum('bhd,bhde->bhe', q_t, state) outputs.append(output_t) output = torch.stack(outputs, dim=2) return output class HierarchicalRetention(nn.Module): """PHOENIX Hierarchical Retention""" def __init__(self, config, layer_idx=0): super().__init__() self.base_retention = MultiScaleRetention(config, layer_idx) hidden_size = config.hidden_size self.d_state = hidden_size // 2 self.short_proj = nn.Linear(hidden_size, self.d_state) self.medium_proj = nn.Linear(self.d_state, self.d_state) self.long_proj = nn.Linear(self.d_state, self.d_state * 2) self.fusion = nn.Linear(self.d_state * 4, hidden_size) self.short_decay = 0.5 self.medium_decay = 0.8 self.long_decay = 0.95 self.norm = nn.LayerNorm(hidden_size) def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.Tensor] = None, past_key_values: Optional[Tuple[torch.Tensor]] = None, **kwargs ): """Hierarchical forward pass""" batch_size, seq_len, hidden_size = hidden_states.shape target_device = hidden_states.device target_dtype = hidden_states.dtype if self.short_proj.weight.device != target_device or self.short_proj.weight.dtype != target_dtype: self.to(device=target_device, dtype=target_dtype) base_result = self.base_retention( hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache ) retention_output = base_result[0] short_state = torch.zeros(batch_size, self.d_state, dtype=target_dtype, device=target_device) medium_state = torch.zeros(batch_size, self.d_state, dtype=target_dtype, device=target_device) long_state = torch.zeros(batch_size, self.d_state * 2, dtype=target_dtype, device=target_device) hierarchical_outputs = [] for t in range(seq_len): x_t = retention_output[:, t, :] short_input = self.short_proj(x_t) short_state = self.short_decay * short_state + short_input if t % 8 == 0: medium_state = self.medium_decay * medium_state + \ self.medium_proj(short_state) if t % 64 == 0: long_state = self.long_decay * long_state + \ self.long_proj(medium_state) combined = torch.cat([short_state, medium_state, long_state], dim=-1) output_t = self.fusion(combined) hierarchical_outputs.append(output_t) output = torch.stack(hierarchical_outputs, dim=1) output = self.norm(output) return (output, None) # ===================================================== # λͺ¨λΈ λ³€ν™˜ ν•¨μˆ˜ # ===================================================== def replace_attention_with_retention(model, use_hierarchical=True, structure_info=None): """Transformer Attention β†’ PHOENIX Retention""" print("πŸ”„ Starting Attention β†’ Retention conversion...") replaced_count = 0 total_layers = 0 layers = None if structure_info and structure_info.get('layer_path'): layer_path = structure_info['layer_path'] if layer_path == 'model.layers': if hasattr(model, 'model') and hasattr(model.model, 'layers'): layers = model.model.layers elif layer_path == 'transformer.h': if hasattr(model, 'transformer') and hasattr(model.transformer, 'h'): layers = model.transformer.h if layers is None: possible_paths = [ ('model.layers', lambda m: m.model.layers if hasattr(m, 'model') and hasattr(m.model, 'layers') else None), ('transformer.h', lambda m: m.transformer.h if hasattr(m, 'transformer') and hasattr(m.transformer, 'h') else None), ] for path_name, path_fn in possible_paths: result = path_fn(model) if result is not None: layers = result break if layers is None: print("❌ Cannot find layers") return model, 0, 0 total_layers = len(layers) print(f" Found {total_layers} layers") if structure_info and structure_info.get('head_dim'): model.config.head_dim = structure_info['head_dim'] for layer_idx, layer in enumerate(layers): try: if hasattr(layer, 'self_attn'): old_attn = layer.self_attn if use_hierarchical: new_retention = HierarchicalRetention(model.config, layer_idx) else: new_retention = MultiScaleRetention(model.config, layer_idx) if hasattr(old_attn, 'q_proj'): try: target = new_retention.base_retention if use_hierarchical else new_retention target.q_proj.weight.data = old_attn.q_proj.weight.data.clone() target.k_proj.weight.data = old_attn.k_proj.weight.data.clone() target.v_proj.weight.data = old_attn.v_proj.weight.data.clone() target.o_proj.weight.data = old_attn.o_proj.weight.data.clone() except: pass layer.self_attn = new_retention replaced_count += 1 except Exception as e: continue print(f"\nβœ… Conversion complete: {replaced_count}/{total_layers} layers") return model, replaced_count, total_layers # ===================================================== # πŸ†• MULTI-GPU Fine-tuning νŒŒμ΄ν”„λΌμΈ # ===================================================== def finetune_retention_model( model, tokenizer, num_steps: int = 3000, batch_size: int = 4, learning_rate: float = 1e-5, output_dir: str = None, use_gradient_checkpointing: bool = True, ): """ πŸ†• v2.0: Brumby-style Retraining with Multi-GPU Support """ # output_dir κΈ°λ³Έκ°’ μ„€μ • if output_dir is None: output_dir = f"{STORAGE_PATH}/finetuning_temp" print("\n" + "="*80) print("πŸ”₯ PHOENIX RETRAINING - Multi-GPU (v2.0)") print("="*80) print(f" GPUs: {NUM_GPUS}") print(f" Target Steps: {num_steps}") print(f" Batch Size per GPU: {batch_size}") print(f" Global Batch Size: {batch_size * NUM_GPUS}") print(f" Learning Rate: {learning_rate}") print(f" Gradient Checkpointing: {use_gradient_checkpointing}") start_time = time.time() # βœ… Gradient Checkpointing (λ©”λͺ¨λ¦¬ μ ˆμ•½) if use_gradient_checkpointing: if hasattr(model, 'gradient_checkpointing_enable'): model.gradient_checkpointing_enable() print(f" βœ… Gradient Checkpointing enabled") # Dataset μ€€λΉ„ train_dataset = prepare_simple_dataset( tokenizer=tokenizer, num_steps=num_steps, batch_size=batch_size * NUM_GPUS # Multi-GPU κ³ λ € ) # βœ… Multi-GPU Training Arguments training_args = TrainingArguments( output_dir=output_dir, # πŸš€ Multi-GPU μ„€μ • per_device_train_batch_size=batch_size, # GPUλ‹Ή batch gradient_accumulation_steps=max(1, 8 // NUM_GPUS), # GPU μˆ˜μ— 따라 μ‘°μ • # Training μ„€μ • num_train_epochs=1, max_steps=num_steps, learning_rate=learning_rate, warmup_steps=100, # Optimization fp16=True, # Mixed precision optim="adamw_torch_fused", # H100 μ΅œμ ν™” # Logging logging_steps=50, logging_first_step=True, save_steps=1000, save_total_limit=2, # Performance dataloader_num_workers=4 * NUM_GPUS, # GPUλ‹Ή 4 workers dataloader_pin_memory=True, # Multi-GPU κ΄€λ ¨ ddp_find_unused_parameters=False, ddp_backend="nccl", # H100 μ΅œμ ν™” # Misc remove_unused_columns=False, report_to="none", # βœ… DeepSpeed (선택사항) # deepspeed="ds_config.json", # DeepSpeed μ‚¬μš©μ‹œ ) # Data collator data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=False ) # βœ… Trainer (μžλ™ Multi-GPU) trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, tokenizer=tokenizer, data_collator=data_collator, ) # Train! print(f"\nπŸš€ Starting Multi-GPU Fine-tuning...") print(f" Using {NUM_GPUS} GPUs") trainer.train() elapsed = time.time() - start_time print(f"\nβœ… Fine-tuning Complete!") print(f" Time: {elapsed/60:.1f} minutes") print(f" Effective samples/sec: {(num_steps * batch_size * NUM_GPUS) / elapsed:.2f}") print(f"="*80 + "\n") return model def prepare_simple_dataset( tokenizer, num_steps: int, batch_size: int, max_length: int = 2048, ): """Dataset μ€€λΉ„""" print(f"\nπŸ“Š Preparing Dataset...") num_samples = num_steps * batch_size print(f" Target samples: {num_samples}") try: dataset = load_dataset( "wikitext", "wikitext-2-raw-v1", split=f"train[:{num_samples}]" ) print(f" βœ… Loaded: {len(dataset)} samples") except Exception as e: print(f" ❌ Failed: {e}") raise def tokenize_function(examples): return tokenizer( examples['text'], truncation=True, max_length=max_length, padding="max_length", ) tokenized = dataset.map( tokenize_function, batched=True, remove_columns=dataset.column_names, num_proc=4 # Parallel processing ) print(f" βœ… Tokenized: {len(tokenized)} samples") return tokenized def estimate_finetuning_cost( model_size: str, num_steps: int, batch_size: int, num_gpus: int = NUM_GPUS, gpu_type: str = "H100", ) -> Dict: """λΉ„μš© 계산기 - Multi-GPU""" gpu_costs = { "H100": 3.0, "A100": 2.0, "A10G": 1.0, } model_step_times = { "0.6B": 0.5, "1.5B": 1.0, "3B": 2.0, "7B": 3.5, "14B": 6.0, } # Multi-GPU둜 μΈν•œ μ‹œκ°„ 단좕 (linear scaling κ°€μ •) step_time = model_step_times.get(model_size, 1.0) * (batch_size / 4) step_time_per_gpu = step_time / num_gpus # GPU 병렬화 total_seconds = num_steps * step_time_per_gpu total_hours = total_seconds / 3600 # λΉ„μš©μ€ GPU 수만큼 곱함 total_cost_usd = total_hours * gpu_costs.get(gpu_type, 2.0) * num_gpus return { 'hours': round(total_hours, 2), 'cost_usd': round(total_cost_usd, 2), 'cost_krw': round(total_cost_usd * 1300, 0), 'num_gpus': num_gpus, 'gpu_type': gpu_type, } # ===================================================== # Custom Modeling Code (동일) # ===================================================== def generate_modeling_phoenix_code(): """PHOENIX Custom Modeling Code v2.0""" return '''""" PHOENIX Retention Model v2.0 βœ… v2.0: Brumby-style Retraining support βœ… v1.4.3: forward() μ‹œκ·Έλ‹ˆμ²˜ Transformers ν˜Έν™˜ βœ… v1.4.3: dtype 뢈일치 μˆ˜μ • """ import torch import torch.nn as nn from typing import Optional, Tuple from transformers.modeling_utils import PreTrainedModel from transformers.configuration_utils import PretrainedConfig from transformers import AutoConfig, AutoModelForCausalLM import os class PhoenixConfig(PretrainedConfig): model_type = "phoenix" def __init__(self, use_phoenix_retention=True, phoenix_version="2.0", original_model=None, use_hierarchical=True, **kwargs): super().__init__(**kwargs) self.use_phoenix_retention = use_phoenix_retention self.phoenix_version = phoenix_version self.original_model = original_model self.use_hierarchical = use_hierarchical class MultiScaleRetention(nn.Module): def __init__(self, config, layer_idx=0): super().__init__() self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = getattr(config, 'head_dim', self.hidden_size // self.num_heads) self.num_key_value_heads = getattr(config, 'num_key_value_heads', self.num_heads) self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.q_dim = self.num_heads * self.head_dim self.kv_dim = self.num_key_value_heads * self.head_dim self.q_proj = nn.Linear(self.hidden_size, self.q_dim, bias=False) self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) self.o_proj = nn.Linear(self.q_dim, self.hidden_size, bias=False) self.decay = nn.Parameter(torch.linspace(0.95, 0.99, self.num_heads)) self.group_norm = nn.GroupNorm(self.num_heads, self.q_dim) def _repeat_kv(self, x, n): b, h, s, d = x.shape if n == 1: return x return x[:, :, None, :, :].expand(b, h, n, s, d).reshape(b, h*n, s, d) def forward(self, hidden_states, **kwargs): b, s, _ = hidden_states.shape device, dtype = hidden_states.device, hidden_states.dtype if self.q_proj.weight.device != device or self.q_proj.weight.dtype != dtype: self.to(device=device, dtype=dtype) q = self.q_proj(hidden_states).view(b, s, self.num_heads, self.head_dim).transpose(1, 2) k = self.k_proj(hidden_states).view(b, s, self.num_key_value_heads, self.head_dim).transpose(1, 2) v = self.v_proj(hidden_states).view(b, s, self.num_key_value_heads, self.head_dim).transpose(1, 2) k = self._repeat_kv(k, self.num_key_value_groups) v = self._repeat_kv(v, self.num_key_value_groups) out = self._retention(q, k, v) out = out.transpose(1, 2).reshape(b, s, self.q_dim) out = self.group_norm(out.transpose(1, 2)).transpose(1, 2) return (self.o_proj(torch.clamp(out, -10, 10)), None) def _retention(self, q, k, v): b, h, s, d = q.shape state = torch.zeros(b, h, d, d, dtype=q.dtype, device=q.device) + 1e-6 decay = torch.sigmoid(self.decay).view(1, -1, 1, 1).to(q) outs = [] for t in range(s): state = decay * state + torch.clamp(torch.einsum('bhd,bhe->bhde', k[:,:,t], v[:,:,t]), -5, 5) state = torch.clamp(state, -10, 10) outs.append(torch.einsum('bhd,bhde->bhe', q[:,:,t], state)) return torch.stack(outs, dim=2) class HierarchicalRetention(nn.Module): def __init__(self, config, layer_idx=0): super().__init__() self.base_retention = MultiScaleRetention(config, layer_idx) h = config.hidden_size self.d_state = h // 2 self.short_proj = nn.Linear(h, self.d_state) self.medium_proj = nn.Linear(self.d_state, self.d_state) self.long_proj = nn.Linear(self.d_state, self.d_state*2) self.fusion = nn.Linear(self.d_state*4, h) self.norm = nn.LayerNorm(h) self.decays = [0.5, 0.8, 0.95] def forward(self, hidden_states, **kwargs): b, s, h = hidden_states.shape device, dtype = hidden_states.device, hidden_states.dtype if self.short_proj.weight.device != device or self.short_proj.weight.dtype != dtype: self.to(device=device, dtype=dtype) ret_out = self.base_retention(hidden_states)[0] short = torch.zeros(b, self.d_state, dtype=dtype, device=device) med = torch.zeros(b, self.d_state, dtype=dtype, device=device) long = torch.zeros(b, self.d_state*2, dtype=dtype, device=device) outs = [] for t in range(s): short = self.decays[0]*short + self.short_proj(ret_out[:,t]) if t % 8 == 0: med = self.decays[1]*med + self.medium_proj(short) if t % 64 == 0: long = self.decays[2]*long + self.long_proj(med) outs.append(self.fusion(torch.cat([short, med, long], -1))) return (self.norm(torch.stack(outs, 1)), None) def replace_attention_with_retention_for_loading(model, use_hierarchical=True): layers = getattr(model, 'model', model) layers = getattr(layers, 'layers', getattr(layers, 'h', None)) if layers is None: return model, 0, 0 original_dtype = None for param in model.parameters(): original_dtype = param.dtype break cnt = 0 for i, layer in enumerate(layers): if hasattr(layer, 'self_attn'): new_ret = HierarchicalRetention(model.config, i) if use_hierarchical else MultiScaleRetention(model.config, i) if original_dtype: new_ret = new_ret.to(dtype=original_dtype) layer.self_attn = new_ret cnt += 1 return model, cnt, len(layers) class PhoenixPreTrainedModel(PreTrainedModel): config_class = PhoenixConfig base_model_prefix = "phoenix" class PhoenixModelForCausalLM(PhoenixPreTrainedModel): def __init__(self, config): super().__init__(config) self._model = None self._ready = False @classmethod def from_pretrained(cls, path, *args, **kwargs): print(f"πŸ”₯ PHOENIX v2.0 loading from {path}") config = AutoConfig.from_pretrained(path, trust_remote_code=True) orig = getattr(config, 'original_model', 'Qwen/Qwen3-0.6B') hier = getattr(config, 'use_hierarchical', True) try: base_cfg = AutoConfig.from_pretrained(orig, trust_remote_code=True) except: base_cfg = config model = AutoModelForCausalLM.from_config(base_cfg) model, conv, tot = replace_attention_with_retention_for_loading(model, hier) print(f" βœ… Converted {conv}/{tot} layers") sd = None if os.path.exists(path): for fname in ["model.safetensors", "pytorch_model.bin"]: fpath = os.path.join(path, fname) if os.path.exists(fpath): if fname.endswith('.safetensors'): from safetensors.torch import load_file sd = load_file(fpath) else: sd = torch.load(fpath, map_location='cpu') break else: from huggingface_hub import hf_hub_download for fname in ["model.safetensors", "pytorch_model.bin"]: try: fpath = hf_hub_download(path, fname) if fname.endswith('.safetensors'): from safetensors.torch import load_file sd = load_file(fpath) else: sd = torch.load(fpath, map_location='cpu') break except: pass if sd: miss, unex = model.load_state_dict(sd, strict=False) print(f" πŸ“¦ Weights: {len(miss)} missing, {len(unex)} unexpected") if 'lm_head.weight' in miss and getattr(config, 'tie_word_embeddings', False): if hasattr(model, 'lm_head') and hasattr(model.model, 'embed_tokens'): model.lm_head.weight = model.model.embed_tokens.weight print(f" πŸ”— Tied embeddings") inst = cls(config) inst._model = model inst._ready = True print(f"βœ… PHOENIX v2.0 ready!") return inst def forward(self, *a, **k): if not self._ready: raise ValueError("Not initialized") return self._model(*a, **k) def generate(self, *a, **k): if not self._ready: raise ValueError("Not initialized") return self._model.generate(*a, **k) AutoConfig.register("phoenix", PhoenixConfig) ''' # ===================================================== # μ €μž₯/μ—…λ‘œλ“œ/평가 (동일) # ===================================================== def save_phoenix_model_with_code(model, tokenizer, output_path, original_model_url, metadata): """PHOENIX λͺ¨λΈ μ €μž₯""" output_path = Path(output_path) output_path.mkdir(parents=True, exist_ok=True) print(f"\nπŸ’Ύ Saving PHOENIX model...") if hasattr(model.config, 'tie_word_embeddings') and model.config.tie_word_embeddings: if hasattr(model, 'lm_head') and hasattr(model, 'model') and hasattr(model.model, 'embed_tokens'): model.lm_head.weight = model.model.embed_tokens.weight model.save_pretrained(output_path) tokenizer.save_pretrained(output_path) modeling_code = generate_modeling_phoenix_code() with open(output_path / "modeling_phoenix.py", "w") as f: f.write(modeling_code) config_path = output_path / "config.json" if config_path.exists(): with open(config_path, "r") as f: config_dict = json.load(f) config_dict["use_phoenix_retention"] = True config_dict["phoenix_version"] = "2.0" config_dict["original_model"] = original_model_url config_dict["auto_map"] = { "AutoModelForCausalLM": "modeling_phoenix.PhoenixModelForCausalLM", } with open(config_path, "w") as f: json.dump(config_dict, f, indent=2) with open(output_path / 'phoenix_metadata.json', 'w') as f: json.dump(metadata, f, indent=2) readme = f"""# πŸ”₯ PHOENIX v2.0 - {original_model_url} **Multi-GPU Trained** with {metadata.get('num_gpus', 1)} GPUs ## Features - βœ… Brumby-style Retraining - βœ… O(n) Complexity - βœ… GQA Support ## Usage ```python from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained( "{output_path.name}", trust_remote_code=True, torch_dtype="auto", device_map="auto" ) ``` **VIDraft AI Research Lab** | PHOENIX v2.0 Multi-GPU """ with open(output_path / "README.md", "w") as f: f.write(readme) print(f" βœ… Model saved") def upload_to_huggingface_hub( model_path: str, original_model_url: str, repo_name: str = None, private: bool = True, token: str = None, ) -> Tuple[bool, str, str]: """Upload to Hub""" if token is None: token = HF_TOKEN if not token: return False, "", "❌ No HF_TOKEN" try: api = HfApi(token=token) user_info = api.whoami(token=token) username = user_info['name'] if not repo_name: base_name = original_model_url.split('/')[-1] repo_name = f"phoenix-{base_name}" repo_id = f"{username}/{repo_name}" create_repo( repo_id=repo_id, token=token, private=private, repo_type="model", exist_ok=True ) api.upload_folder( folder_path=str(model_path), repo_id=repo_id, repo_type="model", token=token, ) hub_url = f"https://huggingface.co/{repo_id}" return True, hub_url, f"βœ… Uploaded to {hub_url}" except Exception as e: return False, "", f"❌ Upload failed: {e}" def evaluate_model_quality(model, tokenizer): """Quality 평가""" test_prompts = [ "The capital of France is", "In machine learning,", "2 + 2 =", ] model.eval() scores = [] with torch.no_grad(): for prompt in test_prompts: try: inputs = tokenizer(prompt, return_tensors="pt").to(model.device) outputs = model.generate( **inputs, max_new_tokens=20, do_sample=False, pad_token_id=tokenizer.eos_token_id, ) generated = tokenizer.decode(outputs[0], skip_special_tokens=True) score = 0.0 if len(generated) > len(prompt): score += 0.3 if not any(c in generated[len(prompt):] for c in ['οΏ½', '[UNK]']): score += 0.3 if len(generated.split()) > len(prompt.split()) + 2: score += 0.4 scores.append(score) except: scores.append(0.0) return sum(scores) / len(scores) if scores else 0.0 # ===================================================== # πŸ†• Multi-GPU Burning ν•¨μˆ˜ # ===================================================== def burn_model_with_finetuning( model_url: str, output_dir: str, use_hierarchical: bool = True, enable_finetuning: bool = False, num_steps: int = 3000, batch_size: int = 4, learning_rate: float = 1e-5, use_gradient_checkpointing: bool = True, ): """πŸ†• v2.0: Multi-GPU Optimized Burning""" print("="*80) print(f"πŸ”₯ PHOENIX Model Burning v2.0 - Multi-GPU ({NUM_GPUS} GPUs)") print("="*80) output_path = Path(output_dir) output_path.mkdir(parents=True, exist_ok=True) try: # STEP 1: Structure Analysis print(f"\nπŸ” STEP 1: Structure Analysis...") structure_info = analyze_model_structure(model_url) # STEP 2: Load Model with device_map="auto" print(f"\nπŸ“₯ STEP 2: Loading model (Multi-GPU)...") start_time = time.time() config = AutoConfig.from_pretrained(model_url, trust_remote_code=True) # βœ… Multi-GPU: device_map="auto"둜 μžλ™ λΆ„μ‚° model = AutoModelForCausalLM.from_pretrained( model_url, trust_remote_code=True, torch_dtype=torch.float16, device_map="auto" # μžλ™μœΌλ‘œ 8개 GPU에 λΆ„μ‚°! ) tokenizer = AutoTokenizer.from_pretrained(model_url, trust_remote_code=True) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token load_time = time.time() - start_time print(f"βœ… Loaded across {NUM_GPUS} GPUs in {load_time:.1f}s") # STEP 3: Convert print(f"\nπŸ”„ STEP 3: Converting Attention β†’ Retention...") convert_start = time.time() model, converted, total = replace_attention_with_retention( model, use_hierarchical=use_hierarchical, structure_info=structure_info ) convert_time = time.time() - convert_start conversion_rate = converted / total if total > 0 else 0 print(f"βœ… Converted {converted}/{total} layers in {convert_time:.1f}s") # STEP 4: Fine-tuning (Multi-GPU) if enable_finetuning: print(f"\nπŸš€ STEP 4: Multi-GPU Fine-tuning...") ft_start = time.time() model = finetune_retention_model( model=model, tokenizer=tokenizer, num_steps=num_steps, batch_size=batch_size, learning_rate=learning_rate, use_gradient_checkpointing=use_gradient_checkpointing, ) ft_time = time.time() - ft_start print(f"βœ… Fine-tuning completed in {ft_time/60:.1f} minutes") else: ft_time = 0 print(f"\n⏭️ STEP 4: Fine-tuning skipped") # STEP 5: Evaluate print(f"\nπŸ“Š STEP 5: Evaluating...") quality_score = evaluate_model_quality(model, tokenizer) print(f"βœ… Quality: {quality_score:.2f}/1.00") # STEP 6: Save print(f"\nπŸ’Ύ STEP 6: Saving...") metadata = { 'phoenix_version': '2.0', 'original_model': model_url, 'use_hierarchical': use_hierarchical, 'conversion_rate': conversion_rate, 'quality_score': quality_score, 'finetuned': enable_finetuning, 'finetuning_steps': num_steps if enable_finetuning else 0, 'num_gpus': NUM_GPUS, 'gradient_checkpointing': use_gradient_checkpointing, 'timestamp': datetime.now().isoformat(), } save_phoenix_model_with_code(model, tokenizer, output_path, model_url, metadata) total_time = time.time() - start_time result = { 'status': 'success', 'model_path': str(output_path), 'conversion_rate': conversion_rate, 'quality_score': quality_score, 'total_time': total_time, 'finetuned': enable_finetuning, 'num_gpus': NUM_GPUS, 'structure_info': structure_info, } print(f"\n{'='*80}") print(f"βœ… Multi-GPU Burning Complete!") print(f" GPUs Used: {NUM_GPUS}") print(f" Model: {output_path}") print(f" Quality: {quality_score:.2f}/1.00") print(f"{'='*80}\n") return result except Exception as e: import traceback return { 'status': 'failed', 'error': str(e), 'traceback': traceback.format_exc() } # ===================================================== # Database (동일) # ===================================================== class ExperimentDatabase: def __init__(self, db_path: str): self.db_path = db_path self.init_database() def init_database(self): with sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() cursor.execute(""" CREATE TABLE IF NOT EXISTS burning_history ( id INTEGER PRIMARY KEY AUTOINCREMENT, model_url TEXT, output_path TEXT, hub_url TEXT, conversion_rate REAL, quality_score REAL, finetuned BOOLEAN, num_gpus INTEGER, timestamp DATETIME DEFAULT CURRENT_TIMESTAMP ) """) conn.commit() def save_burning(self, info: Dict) -> int: with sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() cursor.execute(""" INSERT INTO burning_history (model_url, output_path, hub_url, conversion_rate, quality_score, finetuned, num_gpus) VALUES (?, ?, ?, ?, ?, ?, ?) """, ( info.get('model_url'), info.get('output_path'), info.get('hub_url'), info.get('conversion_rate'), info.get('quality_score'), info.get('finetuned'), info.get('num_gpus', 1), )) conn.commit() return cursor.lastrowid def get_history(self, limit: int = 20) -> List[Dict]: with sqlite3.connect(self.db_path) as conn: conn.row_factory = sqlite3.Row cursor = conn.cursor() cursor.execute("SELECT * FROM burning_history ORDER BY timestamp DESC LIMIT ?", (limit,)) return [dict(row) for row in cursor.fetchall()] db = ExperimentDatabase(DB_PATH) # ===================================================== # Gradio UI # ===================================================== def burn_phoenix_model_ui( model_url, use_hierarchical, output_name, enable_finetuning, ft_steps, ft_batch, ft_lr, use_grad_ckpt, upload_hub, hub_repo, hub_private, ): """Gradio UI""" try: if not model_url.strip(): return "⚠️ Model URL required", None if not output_name.strip(): output_name = f"phoenix_{model_url.split('/')[-1]}_{int(time.time())}" output_dir = f"{MODELS_PATH}/{output_name}" # λΉ„μš© μΆ”μ • if enable_finetuning: model_size = "0.6B" if "0.6B" in model_url else "1.5B" cost = estimate_finetuning_cost(model_size, ft_steps, ft_batch, NUM_GPUS) print(f"\nπŸ’° Estimated Cost: ${cost['cost_usd']} ({cost['hours']}h with {NUM_GPUS} GPUs)") # Burn result = burn_model_with_finetuning( model_url=model_url, output_dir=output_dir, use_hierarchical=use_hierarchical, enable_finetuning=enable_finetuning, num_steps=ft_steps, batch_size=ft_batch, learning_rate=ft_lr, use_gradient_checkpointing=use_grad_ckpt, ) if result['status'] != 'success': return f"❌ Failed\n```\n{result.get('error')}\n```", None # Upload hub_url = None if upload_hub and HF_TOKEN: success, hub_url, msg = upload_to_huggingface_hub( model_path=result['model_path'], original_model_url=model_url, repo_name=hub_repo if hub_repo.strip() else None, private=hub_private, ) # DB db.save_burning({ 'model_url': model_url, 'output_path': result['model_path'], 'hub_url': hub_url, 'conversion_rate': result['conversion_rate'], 'quality_score': result['quality_score'], 'finetuned': enable_finetuning, 'num_gpus': NUM_GPUS, }) # Output output_md = f""" # πŸ”₯ PHOENIX v2.0 Multi-GPU Complete! ## Hardware - **GPUs Used**: {NUM_GPUS} x {torch.cuda.get_device_name(0) if NUM_GPUS > 0 else 'N/A'} ## Model Info - **Original**: {model_url} - **Output**: `{result['model_path']}` - **Conversion**: {result['conversion_rate']*100:.1f}% - **Quality**: {result['quality_score']:.2f}/1.00 - **Fine-tuned**: {'βœ… YES' if enable_finetuning else '❌ NO'} """ if hub_url: output_md += f""" ## Hub Status βœ… **Uploaded**: [{hub_url}]({hub_url}) ```python model = AutoModelForCausalLM.from_pretrained( "{hub_url.replace('https://huggingface.co/', '')}", trust_remote_code=True, device_map="auto" # Multi-GPU ) ``` """ # Plot fig = go.Figure() fig.add_trace(go.Bar( x=['Conversion', 'Quality'], y=[result['conversion_rate'], result['quality_score']], marker_color=['#3b82f6', '#10b981'] )) fig.update_layout(title=f"Metrics ({NUM_GPUS} GPUs)", yaxis_range=[0, 1]) return output_md, fig except Exception as e: import traceback return f"❌ Error:\n```\n{traceback.format_exc()}\n```", None def view_history(): """History""" try: history = db.get_history(20) if not history: return "πŸ“­ No history", None df = pd.DataFrame(history) fig = px.scatter( df, x='timestamp', y='quality_score', color='finetuned', size='num_gpus', title='Burning History (Multi-GPU)' ) return f"## History\n\n{df.to_markdown(index=False)}", fig except Exception as e: return f"❌ Error: {e}", None # ===================================================== # Gradio App # ===================================================== with gr.Blocks(title="πŸ”₯ PHOENIX v2.0 Multi-GPU", theme=gr.themes.Soft()) as demo: gr.Markdown(f""" # πŸ”₯ PHOENIX v2.0 - Multi-GPU Optimized **H100 x {NUM_GPUS} GPUs Ready** πŸ†• **v2.0 Multi-GPU**: Accelerate 톡합, DDP 지원 πŸ†• **v2.0**: Fine-tuning νŒŒμ΄ν”„λΌμΈ (Brumby-style) βœ… v1.4.3: All fixes included βœ… GQA Support | O(n) Complexity --- """) with gr.Tabs(): with gr.Tab("πŸ”₯ Model Burning"): with gr.Row(): with gr.Column(scale=1): burn_url = gr.Textbox( label="πŸ”— Model URL", value=DEFAULT_MODEL, placeholder="Qwen/Qwen3-0.6B" ) burn_hier = gr.Checkbox(value=True, label="Hierarchical Retention") burn_name = gr.Textbox(label="πŸ’Ύ Output Name", placeholder="my_model") gr.Markdown("---") gr.Markdown(f"### πŸ†• Fine-tuning ({NUM_GPUS} GPUs)") burn_ft_enable = gr.Checkbox( value=False, label="πŸš€ Enable Fine-tuning (Brumby-style)", info=f"Multi-GPU acceleration with {NUM_GPUS} GPUs!" ) burn_ft_steps = gr.Slider( 1000, 10000, 3000, step=100, label="Steps", visible=False ) burn_ft_batch = gr.Slider( 1, 16, 4, step=1, label=f"Batch Size per GPU ({NUM_GPUS} GPUs)", visible=False ) burn_ft_lr = gr.Number(value=1e-5, label="Learning Rate", visible=False) burn_grad_ckpt = gr.Checkbox( value=True, label="βœ… Gradient Checkpointing (saves memory)", visible=False ) def toggle_ft(enabled): return [ gr.update(visible=enabled), gr.update(visible=enabled), gr.update(visible=enabled), gr.update(visible=enabled), ] burn_ft_enable.change( toggle_ft, [burn_ft_enable], [burn_ft_steps, burn_ft_batch, burn_ft_lr, burn_grad_ckpt] ) gr.Markdown("---") gr.Markdown("### 🌐 Hub Upload") burn_upload = gr.Checkbox(value=True, label="πŸ“€ Upload to Hub") burn_repo = gr.Textbox(label="πŸ“¦ Repo Name (optional)") burn_private = gr.Checkbox(value=True, label="πŸ”’ Private") burn_btn = gr.Button("πŸ”₯ Burn Model", variant="primary", size="lg") with gr.Column(scale=2): burn_output = gr.Markdown() burn_plot = gr.Plot() burn_btn.click( burn_phoenix_model_ui, [ burn_url, burn_hier, burn_name, burn_ft_enable, burn_ft_steps, burn_ft_batch, burn_ft_lr, burn_grad_ckpt, burn_upload, burn_repo, burn_private ], [burn_output, burn_plot] ) with gr.Tab("πŸ“Š History"): with gr.Row(): with gr.Column(scale=1): hist_btn = gr.Button("πŸ“Š Load", variant="primary") with gr.Column(scale=2): hist_out = gr.Markdown() hist_plot = gr.Plot() hist_btn.click(view_history, outputs=[hist_out, hist_plot]) gr.Markdown(f""" --- ## πŸ”₯ PHOENIX v2.0 Multi-GPU **Hardware**: {NUM_GPUS} x {torch.cuda.get_device_name(0) if NUM_GPUS > 0 else 'N/A'} **Features**: - πŸ†• Multi-GPU Training (DDP) - πŸ†• Gradient Checkpointing - πŸ†• H100 Optimized (fused optimizer) - πŸ†• Brumby-style Fine-tuning - βœ… All v1.4.3 Fixes **Token**: {'βœ…' if HF_TOKEN else '❌ Not Found'} **VIDraft AI Research Lab** | PHOENIX v2.0 Multi-GPU """) if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description='PHOENIX v2.0 Multi-GPU') parser.add_argument('--port', type=int, default=None, help='Server port (default: auto find 7860-7960)') parser.add_argument('--share', action='store_true', help='Create public Gradio link') parser.add_argument('--host', type=str, default="0.0.0.0", help='Server host') args = parser.parse_args() demo.queue(max_size=20) # 포트 μžλ™ μ°ΎκΈ° if args.port is None: # 7860λΆ€ν„° 7960κΉŒμ§€ μ‹œλ„ for port in range(7860, 7960): try: demo.launch( server_name=args.host, server_port=port, share=args.share, show_error=True ) break except OSError: continue else: demo.launch( server_name=args.host, server_port=args.port, share=args.share, show_error=True )