Training in progress, step 150

Browse files

Files changed (3) hide show

.ipynb_checkpoints/Untitled-checkpoint.ipynb +6 -0
Untitled.ipynb +296 -0
adapter_model.safetensors +1 -1

.ipynb_checkpoints/Untitled-checkpoint.ipynb ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

Untitled.ipynb ADDED Viewed

	@@ -0,0 +1,296 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "bab68ed2-639c-4494-a0db-8a09169ba276",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🚀 H200 GPU Monitor Ready!\n",
+      "\n",
+      "Usage:\n",
+      "quick_check()           # One-time status check\n",
+      "monitor_gpu(30, 2)      # Monitor for 30 minutes, refresh every 2 seconds\n",
+      "monitor_gpu(60)         # Monitor for 1 hour with default settings\n"
+     ]
+    }
+   ],
+   "source": [
+    "import subprocess\n",
+    "import time\n",
+    "import json\n",
+    "import pandas as pd\n",
+    "from IPython.display import display, clear_output\n",
+    "import matplotlib.pyplot as plt\n",
+    "from datetime import datetime\n",
+    "\n",
+    "def _parse_gpu_metric(raw):\n",
+    "    \"\"\"Convert one nvidia-smi CSV field to float; non-numeric fields become 0.\"\"\"\n",
+    "    # Unavailable metrics come back as placeholders such as '[Not Supported]'\n",
+    "    # or '[N/A]'; they must not abort parsing of the whole report.\n",
+    "    try:\n",
+    "        return float(raw)\n",
+    "    except ValueError:\n",
+    "        return 0.0\n",
+    "\n",
+    "\n",
+    "def get_gpu_stats():\n",
+    "    \"\"\"Get comprehensive GPU statistics.\n",
+    "\n",
+    "    Runs nvidia-smi once and parses its CSV report.\n",
+    "\n",
+    "    Returns:\n",
+    "        list[dict]: One dict per parsed GPU row with the keys 'GPU', 'Name',\n",
+    "        'GPU_Util_%', 'Mem_Util_%', 'Mem_Used_MB', 'Mem_Total_MB', 'Temp_C',\n",
+    "        'Power_W' and 'Power_Limit_W'. Metrics that are not numeric in the\n",
+    "        report are returned as 0. An empty list is returned (after printing\n",
+    "        the error) when nvidia-smi cannot be run or exits with an error.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        # Get GPU stats using nvidia-smi\n",
+    "        result = subprocess.run([\n",
+    "            'nvidia-smi', '--query-gpu=index,name,utilization.gpu,utilization.memory,memory.used,memory.total,temperature.gpu,power.draw,power.limit',\n",
+    "            '--format=csv,noheader,nounits'\n",
+    "        ], capture_output=True, text=True, check=True)\n",
+    "    except Exception as e:\n",
+    "        # Best effort: callers treat an empty list as \"no data available\"\n",
+    "        print(f\"Error getting GPU stats: {e}\")\n",
+    "        return []\n",
+    "\n",
+    "    # Metric columns in the order requested by --query-gpu (after index and name)\n",
+    "    metric_keys = (\n",
+    "        'GPU_Util_%', 'Mem_Util_%', 'Mem_Used_MB', 'Mem_Total_MB',\n",
+    "        'Temp_C', 'Power_W', 'Power_Limit_W',\n",
+    "    )\n",
+    "\n",
+    "    gpu_data = []\n",
+    "    for line in result.stdout.strip().split('\\n'):\n",
+    "        parts = [part.strip() for part in line.split(',')]\n",
+    "        if len(parts) < 9:\n",
+    "            continue  # blank or truncated row\n",
+    "        try:\n",
+    "            gpu_index = int(parts[0])\n",
+    "        except ValueError:\n",
+    "            continue  # skip only this row, keep the other GPUs\n",
+    "\n",
+    "        entry = {'GPU': gpu_index, 'Name': parts[1]}\n",
+    "        for key, raw in zip(metric_keys, parts[2:9]):\n",
+    "            entry[key] = _parse_gpu_metric(raw)\n",
+    "        gpu_data.append(entry)\n",
+    "\n",
+    "    return gpu_data\n",
+    "\n",
+    "def monitor_gpu(duration_minutes=60, refresh_seconds=2):\n",
+    "    \"\"\"Monitor GPU utilization in real-time.\n",
+    "\n",
+    "    Polls get_gpu_stats() until duration_minutes have elapsed or the kernel\n",
+    "    is interrupted, redrawing a status report after every poll. When more\n",
+    "    than 5 samples were collected, plot_utilization_summary() is called on\n",
+    "    the retained history.\n",
+    "\n",
+    "    Args:\n",
+    "        duration_minutes: How long to keep monitoring, in minutes.\n",
+    "        refresh_seconds: Pause between two polls, in seconds.\n",
+    "    \"\"\"\n",
+    "    # Rolling window kept for the final plot, counted per GPU so multi-GPU\n",
+    "    # hosts retain as many samples per device as a single-GPU host.\n",
+    "    max_samples_per_gpu = 100\n",
+    "\n",
+    "    def _percent(part, whole):\n",
+    "        # A zero denominator means the metric is unavailable: show 0% instead of raising.\n",
+    "        return (part / whole) * 100 if whole > 0 else 0\n",
+    "\n",
+    "    print(\"🚀 GPU Utilization Monitor - H200 Training Analysis\")\n",
+    "    print(\"=\" * 70)\n",
+    "\n",
+    "    start_time = time.time()\n",
+    "    end_time = start_time + (duration_minutes * 60)\n",
+    "\n",
+    "    # Store history for plotting\n",
+    "    history = []\n",
+    "\n",
+    "    try:\n",
+    "        while time.time() < end_time:\n",
+    "            clear_output(wait=True)\n",
+    "\n",
+    "            # Get current stats\n",
+    "            gpu_stats = get_gpu_stats()\n",
+    "            timestamp = datetime.now()\n",
+    "\n",
+    "            if gpu_stats:\n",
+    "                # Add to history (copies, so later changes to gpu_stats cannot alter it)\n",
+    "                for gpu in gpu_stats:\n",
+    "                    gpu['timestamp'] = timestamp\n",
+    "                    history.append(gpu.copy())\n",
+    "\n",
+    "                # Display current stats\n",
+    "                print(\"🚀 GPU Utilization Monitor - H200 Training Analysis\")\n",
+    "                print(\"=\" * 70)\n",
+    "                print(f\"⏰ Monitoring Time: {(time.time() - start_time)/60:.1f}/{duration_minutes} minutes\")\n",
+    "                print(f\"🕐 Current Time: {timestamp.strftime('%H:%M:%S')}\")\n",
+    "                print()\n",
+    "\n",
+    "                for gpu in gpu_stats:\n",
+    "                    name = gpu['Name']\n",
+    "                    gpu_util = gpu['GPU_Util_%']\n",
+    "                    mem_used = gpu['Mem_Used_MB'] / 1024  # Convert to GB\n",
+    "                    mem_total = gpu['Mem_Total_MB'] / 1024\n",
+    "                    mem_percent = _percent(mem_used, mem_total)\n",
+    "                    temp = gpu['Temp_C']\n",
+    "                    power = gpu['Power_W']\n",
+    "                    power_limit = gpu['Power_Limit_W']\n",
+    "                    power_percent = _percent(power, power_limit)\n",
+    "\n",
+    "                    print(f\"🔥 GPU {gpu['GPU']}: {name}\")\n",
+    "                    print(f\"   💻 Compute Utilization: {gpu_util:6.1f}% {'🟢' if gpu_util > 90 else '🟡' if gpu_util > 70 else '🔴'}\")\n",
+    "                    print(f\"   🧠 Memory: {mem_used:6.1f}GB / {mem_total:6.1f}GB ({mem_percent:5.1f}%) {'🟢' if mem_percent > 95 else '🟡' if mem_percent > 80 else '🔴'}\")\n",
+    "                    print(f\"   🌡️  Temperature: {temp:5.1f}°C {'🟢' if temp < 80 else '🟡' if temp < 90 else '🔴'}\")\n",
+    "                    print(f\"   ⚡ Power: {power:6.1f}W / {power_limit:6.1f}W ({power_percent:5.1f}%) {'🟢' if power_percent > 80 else '🟡' if power_percent > 60 else '🔴'}\")\n",
+    "                    print()\n",
+    "\n",
+    "                # Analysis (averages across all GPUs of this poll)\n",
+    "                avg_gpu_util = sum(gpu['GPU_Util_%'] for gpu in gpu_stats) / len(gpu_stats)\n",
+    "                avg_mem_util = sum(_percent(gpu['Mem_Used_MB'], gpu['Mem_Total_MB']) for gpu in gpu_stats) / len(gpu_stats)\n",
+    "\n",
+    "                print(\"📊 UTILIZATION ANALYSIS:\")\n",
+    "                if avg_gpu_util > 90 and avg_mem_util > 95:\n",
+    "                    print(\"   ✅ EXCELLENT: GPU is fully utilized!\")\n",
+    "                elif avg_gpu_util > 70 and avg_mem_util > 80:\n",
+    "                    print(\"   ⚠️  GOOD: GPU is well utilized but could be optimized\")\n",
+    "                else:\n",
+    "                    print(\"   ❌ UNDERUTILIZED: GPU has significant unused capacity\")\n",
+    "\n",
+    "                print(f\"   📈 Average GPU Compute: {avg_gpu_util:.1f}%\")\n",
+    "                print(f\"   💾 Average Memory Usage: {avg_mem_util:.1f}%\")\n",
+    "\n",
+    "                # Training efficiency indicators\n",
+    "                if avg_gpu_util < 70:\n",
+    "                    print(\"\\n🔧 OPTIMIZATION SUGGESTIONS:\")\n",
+    "                    print(\"   • Increase batch size\")\n",
+    "                    print(\"   • Reduce gradient accumulation steps\")\n",
+    "                    print(\"   • Check if CPU is bottlenecking data loading\")\n",
+    "                    print(\"   • Increase dataloader workers\")\n",
+    "\n",
+    "                # Keep only the last 100 samples of every GPU for plotting\n",
+    "                history = history[-max_samples_per_gpu * len(gpu_stats):]\n",
+    "\n",
+    "            else:\n",
+    "                print(\"❌ Could not retrieve GPU statistics\")\n",
+    "\n",
+    "            # Do not sleep past the requested end of the session\n",
+    "            time.sleep(min(refresh_seconds, max(0, end_time - time.time())))\n",
+    "\n",
+    "    except KeyboardInterrupt:\n",
+    "        print(\"\\n⏹️  Monitoring stopped by user\")\n",
+    "\n",
+    "    # Plot summary if we have history\n",
+    "    if len(history) > 5:\n",
+    "        plot_utilization_summary(history)\n",
+    "\n",
+    "def plot_utilization_summary(history):\n",
+    "    \"\"\"Plot utilization summary.\n",
+    "\n",
+    "    Draws a 2x2 grid (compute, memory, temperature, power) over time with\n",
+    "    one line per GPU, then prints averages over the given history.\n",
+    "\n",
+    "    Args:\n",
+    "        history: List of per-sample dicts with the keys produced by\n",
+    "            get_gpu_stats() plus a 'timestamp' key. Nothing is drawn or\n",
+    "            printed when it is empty.\n",
+    "    \"\"\"\n",
+    "    clear_output(wait=True)\n",
+    "\n",
+    "    df = pd.DataFrame(history)\n",
+    "    if df.empty:\n",
+    "        return\n",
+    "\n",
+    "    # A 0 total/limit means the metric was unavailable; NaN keeps such rows\n",
+    "    # out of the lines and the averages instead of producing inf.\n",
+    "    nan = float('nan')\n",
+    "    mem_percent = (df['Mem_Used_MB'] / df['Mem_Total_MB'].replace(0, nan)) * 100\n",
+    "    power_percent = (df['Power_W'] / df['Power_Limit_W'].replace(0, nan)) * 100\n",
+    "\n",
+    "    # (grid position, series, line colour, title, y label,\n",
+    "    #  reference value, reference colour, reference label)\n",
+    "    panels = [\n",
+    "        ((0, 0), df['GPU_Util_%'], 'b', 'GPU Compute Utilization (%)', 'Utilization %', 90, 'g', 'Target >90%'),\n",
+    "        ((0, 1), mem_percent, 'r', 'Memory Utilization (%)', 'Memory %', 95, 'g', 'Target >95%'),\n",
+    "        ((1, 0), df['Temp_C'], 'orange', 'Temperature (°C)', 'Temperature °C', 80, 'r', 'Caution >80°C'),\n",
+    "        ((1, 1), power_percent, 'purple', 'Power Usage (%)', 'Power %', 80, 'g', 'Target >80%'),\n",
+    "    ]\n",
+    "\n",
+    "    # Rows of different GPUs are interleaved in history; drawing them as one\n",
+    "    # line would zig-zag between devices, so each GPU gets its own line.\n",
+    "    gpu_groups = list(df.groupby('GPU')) if 'GPU' in df.columns else [(None, df)]\n",
+    "\n",
+    "    fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
+    "    fig.suptitle('H200 GPU Utilization Summary', fontsize=16)\n",
+    "\n",
+    "    for position, series, color, title, ylabel, ref_value, ref_color, ref_label in panels:\n",
+    "        ax = axes[position]\n",
+    "        for gpu_id, rows in gpu_groups:\n",
+    "            # One GPU: keep the panel colour. Several: let matplotlib cycle\n",
+    "            # colours and label each line so the legend tells them apart.\n",
+    "            line_style = {'color': color} if len(gpu_groups) == 1 else {'label': f'GPU {gpu_id}'}\n",
+    "            ax.plot(rows['timestamp'], series.loc[rows.index], linewidth=2, **line_style)\n",
+    "        ax.set_title(title)\n",
+    "        ax.set_ylabel(ylabel)\n",
+    "        ax.grid(True, alpha=0.3)\n",
+    "        ax.axhline(y=ref_value, color=ref_color, linestyle='--', alpha=0.7, label=ref_label)\n",
+    "        ax.legend()\n",
+    "\n",
+    "    plt.tight_layout()\n",
+    "    plt.show()\n",
+    "\n",
+    "    # Print summary statistics\n",
+    "    print(\"\\n📊 TRAINING SESSION SUMMARY:\")\n",
+    "    print(\"=\" * 50)\n",
+    "    print(f\"Average GPU Utilization: {df['GPU_Util_%'].mean():.1f}% (Target: >90%)\")\n",
+    "    print(f\"Average Memory Usage: {mem_percent.mean():.1f}% (Target: >95%)\")\n",
+    "    print(f\"Average Temperature: {df['Temp_C'].mean():.1f}°C (Safe: <80°C)\")\n",
+    "    print(f\"Average Power Usage: {power_percent.mean():.1f}% (Target: >80%)\")\n",
+    "    print(f\"Max Memory Used: {df['Mem_Used_MB'].max()/1024:.1f}GB\")\n",
+    "\n",
+    "# Quick one-time check function\n",
+    "def quick_check():\n",
+    "    \"\"\"Quick one-time GPU status check.\n",
+    "\n",
+    "    Prints compute, memory, temperature and power for every GPU returned by\n",
+    "    get_gpu_stats(), or a notice when no statistics are available.\n",
+    "    \"\"\"\n",
+    "    gpu_stats = get_gpu_stats()\n",
+    "\n",
+    "    if not gpu_stats:\n",
+    "        # Same notice monitor_gpu() prints, so an empty result is never silent\n",
+    "        print(\"❌ Could not retrieve GPU statistics\")\n",
+    "        return\n",
+    "\n",
+    "    for gpu in gpu_stats:\n",
+    "        mem_used_gb = gpu['Mem_Used_MB'] / 1024\n",
+    "        mem_total_gb = gpu['Mem_Total_MB'] / 1024\n",
+    "        # A 0 total means the metric is unavailable: report 0% instead of raising\n",
+    "        mem_percent = (mem_used_gb / mem_total_gb) * 100 if mem_total_gb > 0 else 0\n",
+    "\n",
+    "        print(f\"🔥 GPU {gpu['GPU']}: {gpu['Name']}\")\n",
+    "        print(f\"   💻 Compute: {gpu['GPU_Util_%']:.1f}%\")\n",
+    "        print(f\"   🧠 Memory: {mem_used_gb:.1f}GB/{mem_total_gb:.1f}GB ({mem_percent:.1f}%)\")\n",
+    "        print(f\"   🌡️  Temp: {gpu['Temp_C']:.1f}°C\")\n",
+    "        print(f\"   ⚡ Power: {gpu['Power_W']:.1f}W\")\n",
+    "\n",
+    "# Usage banner: printed once when the cell defining the helpers is run.\n",
+    "print(\n",
+    "    \"🚀 H200 GPU Monitor Ready!\",\n",
+    "    \"\\nUsage:\",\n",
+    "    \"quick_check()           # One-time status check\",\n",
+    "    \"monitor_gpu(30, 2)      # Monitor for 30 minutes, refresh every 2 seconds\",\n",
+    "    \"monitor_gpu(60)         # Monitor for 1 hour with default settings\",\n",
+    "    sep=\"\\n\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ed5c7a88-eddf-4c8d-afac-2bda4bc3dcb9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🚀 GPU Utilization Monitor - H200 Training Analysis\n",
+      "======================================================================\n",
+      "⏰ Monitoring Time: 1.6/30 minutes\n",
+      "🕐 Current Time: 22:44:08\n",
+      "\n",
+      "🔥 GPU 0: NVIDIA H200 NVL\n",
+      "   💻 Compute Utilization:  100.0% 🟢\n",
+      "   🧠 Memory:  138.7GB /  140.4GB ( 98.8%) 🟢\n",
+      "   🌡️  Temperature:  83.0°C 🟡\n",
+      "   ⚡ Power:  540.7W /  600.0W ( 90.1%) 🟢\n",
+      "\n",
+      "📊 UTILIZATION ANALYSIS:\n",
+      "   ✅ EXCELLENT: GPU is fully utilized!\n",
+      "   📈 Average GPU Compute: 100.0%\n",
+      "   💾 Average Memory Usage: 98.8%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Watch the GPU for 30 minutes, polling every 2 seconds\n",
+    "monitor_gpu(30, 2) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "910a1785-c220-4bbc-810c-8df4ca8e9931",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bbc192d0f96870619f0cf8660469dcb3e58db1859ebc98ebe73bb2ce4f8bca14
 size 960760928

 version https://git-lfs.github.com/spec/v1
+oid sha256:4ec865c41f048ddedd7437b869d7037cf8898242aaff867df014f34b8c3ddc81
 size 960760928