ykarout committed on
Commit
65acd75
·
verified ·
1 Parent(s): 99ce0c8

Training in progress, step 150

Browse files
.ipynb_checkpoints/Untitled-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 5
6
+ }
Untitled.ipynb ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "bab68ed2-639c-4494-a0db-8a09169ba276",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "🚀 H200 GPU Monitor Ready!\n",
14
+ "\n",
15
+ "Usage:\n",
16
+ "quick_check() # One-time status check\n",
17
+ "monitor_gpu(30, 2) # Monitor for 30 minutes, refresh every 2 seconds\n",
18
+ "monitor_gpu(60) # Monitor for 1 hour with default settings\n"
19
+ ]
20
+ }
21
+ ],
22
+ "source": [
23
+ "import subprocess\n",
24
+ "import time\n",
25
+ "import json\n",
26
+ "import pandas as pd\n",
27
+ "from IPython.display import display, clear_output\n",
28
+ "import matplotlib.pyplot as plt\n",
29
+ "from datetime import datetime\n",
30
+ "\n",
31
+ "def get_gpu_stats():\n",
32
+ " \"\"\"Get comprehensive GPU statistics\"\"\"\n",
33
+ " try:\n",
34
+ " # Get GPU stats using nvidia-smi\n",
35
+ " result = subprocess.run([\n",
36
+ " 'nvidia-smi', '--query-gpu=index,name,utilization.gpu,utilization.memory,memory.used,memory.total,temperature.gpu,power.draw,power.limit',\n",
37
+ " '--format=csv,noheader,nounits'\n",
38
+ " ], capture_output=True, text=True, check=True)\n",
39
+ " \n",
40
+ " lines = result.stdout.strip().split('\\n')\n",
41
+ " gpu_data = []\n",
42
+ " \n",
43
+ " for line in lines:\n",
44
+ " parts = [part.strip() for part in line.split(',')]\n",
45
+ " if len(parts) >= 9:\n",
46
+ " gpu_data.append({\n",
47
+ " 'GPU': int(parts[0]),\n",
48
+ " 'Name': parts[1],\n",
49
+ " 'GPU_Util_%': float(parts[2]) if parts[2] != '[Not Supported]' else 0,\n",
50
+ " 'Mem_Util_%': float(parts[3]) if parts[3] != '[Not Supported]' else 0,\n",
51
+ " 'Mem_Used_MB': float(parts[4]),\n",
52
+ " 'Mem_Total_MB': float(parts[5]),\n",
53
+ " 'Temp_C': float(parts[6]) if parts[6] != '[Not Supported]' else 0,\n",
54
+ " 'Power_W': float(parts[7]) if parts[7] != '[Not Supported]' else 0,\n",
55
+ " 'Power_Limit_W': float(parts[8]) if parts[8] != '[Not Supported]' else 0\n",
56
+ " })\n",
57
+ " \n",
58
+ " return gpu_data\n",
59
+ " except Exception as e:\n",
60
+ " print(f\"Error getting GPU stats: {e}\")\n",
61
+ " return []\n",
62
+ "\n",
63
+ "def monitor_gpu(duration_minutes=60, refresh_seconds=2):\n",
64
+ " \"\"\"Monitor GPU utilization in real-time\"\"\"\n",
65
+ " \n",
66
+ " print(\"πŸš€ GPU Utilization Monitor - H200 Training Analysis\")\n",
67
+ " print(\"=\" * 70)\n",
68
+ " \n",
69
+ " start_time = time.time()\n",
70
+ " end_time = start_time + (duration_minutes * 60)\n",
71
+ " \n",
72
+ " # Store history for plotting\n",
73
+ " history = []\n",
74
+ " \n",
75
+ " try:\n",
76
+ " while time.time() < end_time:\n",
77
+ " clear_output(wait=True)\n",
78
+ " \n",
79
+ " # Get current stats\n",
80
+ " gpu_stats = get_gpu_stats()\n",
81
+ " timestamp = datetime.now()\n",
82
+ " \n",
83
+ " if gpu_stats:\n",
84
+ " # Add to history\n",
85
+ " for gpu in gpu_stats:\n",
86
+ " gpu['timestamp'] = timestamp\n",
87
+ " history.append(gpu.copy())\n",
88
+ " \n",
89
+ " # Display current stats\n",
90
+ " print(\"πŸš€ GPU Utilization Monitor - H200 Training Analysis\")\n",
91
+ " print(\"=\" * 70)\n",
92
+ " print(f\"⏰ Monitoring Time: {(time.time() - start_time)/60:.1f}/{duration_minutes} minutes\")\n",
93
+ " print(f\"πŸ• Current Time: {timestamp.strftime('%H:%M:%S')}\")\n",
94
+ " print()\n",
95
+ " \n",
96
+ " for gpu in gpu_stats:\n",
97
+ " name = gpu['Name']\n",
98
+ " gpu_util = gpu['GPU_Util_%']\n",
99
+ " mem_used = gpu['Mem_Used_MB'] / 1024 # Convert to GB\n",
100
+ " mem_total = gpu['Mem_Total_MB'] / 1024\n",
101
+ " mem_percent = (mem_used / mem_total) * 100\n",
102
+ " temp = gpu['Temp_C']\n",
103
+ " power = gpu['Power_W']\n",
104
+ " power_limit = gpu['Power_Limit_W']\n",
105
+ " power_percent = (power / power_limit) * 100 if power_limit > 0 else 0\n",
106
+ " \n",
107
+ " print(f\"πŸ”₯ GPU {gpu['GPU']}: {name}\")\n",
108
+ " print(f\" πŸ’» Compute Utilization: {gpu_util:6.1f}% {'🟒' if gpu_util > 90 else '🟑' if gpu_util > 70 else 'πŸ”΄'}\")\n",
109
+ " print(f\" 🧠 Memory: {mem_used:6.1f}GB / {mem_total:6.1f}GB ({mem_percent:5.1f}%) {'🟒' if mem_percent > 95 else '🟑' if mem_percent > 80 else 'πŸ”΄'}\")\n",
110
+ " print(f\" 🌑️ Temperature: {temp:5.1f}Β°C {'🟒' if temp < 80 else '🟑' if temp < 90 else 'πŸ”΄'}\")\n",
111
+ " print(f\" ⚑ Power: {power:6.1f}W / {power_limit:6.1f}W ({power_percent:5.1f}%) {'🟒' if power_percent > 80 else '🟑' if power_percent > 60 else 'πŸ”΄'}\")\n",
112
+ " print()\n",
113
+ " \n",
114
+ " # Analysis\n",
115
+ " avg_gpu_util = sum(gpu['GPU_Util_%'] for gpu in gpu_stats) / len(gpu_stats)\n",
116
+ " avg_mem_util = sum((gpu['Mem_Used_MB']/gpu['Mem_Total_MB'])*100 for gpu in gpu_stats) / len(gpu_stats)\n",
117
+ " \n",
118
+ " print(\"πŸ“Š UTILIZATION ANALYSIS:\")\n",
119
+ " if avg_gpu_util > 90 and avg_mem_util > 95:\n",
120
+ " print(\" βœ… EXCELLENT: GPU is fully utilized!\")\n",
121
+ " elif avg_gpu_util > 70 and avg_mem_util > 80:\n",
122
+ " print(\" ⚠️ GOOD: GPU is well utilized but could be optimized\")\n",
123
+ " else:\n",
124
+ " print(\" ❌ UNDERUTILIZED: GPU has significant unused capacity\")\n",
125
+ " \n",
126
+ " print(f\" πŸ“ˆ Average GPU Compute: {avg_gpu_util:.1f}%\")\n",
127
+ " print(f\" πŸ’Ύ Average Memory Usage: {avg_mem_util:.1f}%\")\n",
128
+ " \n",
129
+ " # Training efficiency indicators\n",
130
+ " if avg_gpu_util < 70:\n",
131
+ " print(\"\\nπŸ”§ OPTIMIZATION SUGGESTIONS:\")\n",
132
+ " print(\" β€’ Increase batch size\")\n",
133
+ " print(\" β€’ Reduce gradient accumulation steps\")\n",
134
+ " print(\" β€’ Check if CPU is bottlenecking data loading\")\n",
135
+ " print(\" β€’ Increase dataloader workers\")\n",
136
+ " \n",
137
+ " # Keep only last 100 data points for plotting\n",
138
+ " if len(history) > 100:\n",
139
+ " history = history[-100:]\n",
140
+ " \n",
141
+ " else:\n",
142
+ " print(\"❌ Could not retrieve GPU statistics\")\n",
143
+ " \n",
144
+ " time.sleep(refresh_seconds)\n",
145
+ " \n",
146
+ " except KeyboardInterrupt:\n",
147
+ " print(\"\\n⏹️ Monitoring stopped by user\")\n",
148
+ " \n",
149
+ " # Plot summary if we have history\n",
150
+ " if len(history) > 5:\n",
151
+ " plot_utilization_summary(history)\n",
152
+ "\n",
153
+ "def plot_utilization_summary(history):\n",
154
+ " \"\"\"Plot utilization summary\"\"\"\n",
155
+ " clear_output(wait=True)\n",
156
+ " \n",
157
+ " df = pd.DataFrame(history)\n",
158
+ " \n",
159
+ " if not df.empty:\n",
160
+ " fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
161
+ " fig.suptitle('H200 GPU Utilization Summary', fontsize=16)\n",
162
+ " \n",
163
+ " # GPU Utilization\n",
164
+ " axes[0,0].plot(df['timestamp'], df['GPU_Util_%'], 'b-', linewidth=2)\n",
165
+ " axes[0,0].set_title('GPU Compute Utilization (%)')\n",
166
+ " axes[0,0].set_ylabel('Utilization %')\n",
167
+ " axes[0,0].grid(True, alpha=0.3)\n",
168
+ " axes[0,0].axhline(y=90, color='g', linestyle='--', alpha=0.7, label='Target >90%')\n",
169
+ " axes[0,0].legend()\n",
170
+ " \n",
171
+ " # Memory Utilization\n",
172
+ " mem_percent = (df['Mem_Used_MB'] / df['Mem_Total_MB']) * 100\n",
173
+ " axes[0,1].plot(df['timestamp'], mem_percent, 'r-', linewidth=2)\n",
174
+ " axes[0,1].set_title('Memory Utilization (%)')\n",
175
+ " axes[0,1].set_ylabel('Memory %')\n",
176
+ " axes[0,1].grid(True, alpha=0.3)\n",
177
+ " axes[0,1].axhline(y=95, color='g', linestyle='--', alpha=0.7, label='Target >95%')\n",
178
+ " axes[0,1].legend()\n",
179
+ " \n",
180
+ " # Temperature\n",
181
+ " axes[1,0].plot(df['timestamp'], df['Temp_C'], 'orange', linewidth=2)\n",
182
+ " axes[1,0].set_title('Temperature (Β°C)')\n",
183
+ " axes[1,0].set_ylabel('Temperature Β°C')\n",
184
+ " axes[1,0].grid(True, alpha=0.3)\n",
185
+ " axes[1,0].axhline(y=80, color='r', linestyle='--', alpha=0.7, label='Caution >80Β°C')\n",
186
+ " axes[1,0].legend()\n",
187
+ " \n",
188
+ " # Power Usage\n",
189
+ " power_percent = (df['Power_W'] / df['Power_Limit_W']) * 100\n",
190
+ " axes[1,1].plot(df['timestamp'], power_percent, 'purple', linewidth=2)\n",
191
+ " axes[1,1].set_title('Power Usage (%)')\n",
192
+ " axes[1,1].set_ylabel('Power %')\n",
193
+ " axes[1,1].grid(True, alpha=0.3)\n",
194
+ " axes[1,1].axhline(y=80, color='g', linestyle='--', alpha=0.7, label='Target >80%')\n",
195
+ " axes[1,1].legend()\n",
196
+ " \n",
197
+ " plt.tight_layout()\n",
198
+ " plt.show()\n",
199
+ " \n",
200
+ " # Print summary statistics\n",
201
+ " print(\"\\nπŸ“Š TRAINING SESSION SUMMARY:\")\n",
202
+ " print(\"=\" * 50)\n",
203
+ " print(f\"Average GPU Utilization: {df['GPU_Util_%'].mean():.1f}% (Target: >90%)\")\n",
204
+ " print(f\"Average Memory Usage: {mem_percent.mean():.1f}% (Target: >95%)\")\n",
205
+ " print(f\"Average Temperature: {df['Temp_C'].mean():.1f}Β°C (Safe: <80Β°C)\")\n",
206
+ " print(f\"Average Power Usage: {power_percent.mean():.1f}% (Target: >80%)\")\n",
207
+ " print(f\"Max Memory Used: {df['Mem_Used_MB'].max()/1024:.1f}GB\")\n",
208
+ "\n",
209
+ "# Quick one-time check function\n",
210
+ "def quick_check():\n",
211
+ " \"\"\"Quick one-time GPU status check\"\"\"\n",
212
+ " gpu_stats = get_gpu_stats()\n",
213
+ " \n",
214
+ " if gpu_stats:\n",
215
+ " for gpu in gpu_stats:\n",
216
+ " mem_used_gb = gpu['Mem_Used_MB'] / 1024\n",
217
+ " mem_total_gb = gpu['Mem_Total_MB'] / 1024\n",
218
+ " mem_percent = (mem_used_gb / mem_total_gb) * 100\n",
219
+ " \n",
220
+ " print(f\"πŸ”₯ GPU {gpu['GPU']}: {gpu['Name']}\")\n",
221
+ " print(f\" πŸ’» Compute: {gpu['GPU_Util_%']:.1f}%\")\n",
222
+ " print(f\" 🧠 Memory: {mem_used_gb:.1f}GB/{mem_total_gb:.1f}GB ({mem_percent:.1f}%)\")\n",
223
+ " print(f\" 🌑️ Temp: {gpu['Temp_C']:.1f}°C\")\n",
224
+ " print(f\" ⚑ Power: {gpu['Power_W']:.1f}W\")\n",
225
+ "\n",
226
+ "# Usage examples:\n",
227
+ "print(\"πŸš€ H200 GPU Monitor Ready!\")\n",
228
+ "print(\"\\nUsage:\")\n",
229
+ "print(\"quick_check() # One-time status check\")\n",
230
+ "print(\"monitor_gpu(30, 2) # Monitor for 30 minutes, refresh every 2 seconds\")\n",
231
+ "print(\"monitor_gpu(60) # Monitor for 1 hour with default settings\")"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": null,
237
+ "id": "ed5c7a88-eddf-4c8d-afac-2bda4bc3dcb9",
238
+ "metadata": {},
239
+ "outputs": [
240
+ {
241
+ "name": "stdout",
242
+ "output_type": "stream",
243
+ "text": [
244
+ "πŸš€ GPU Utilization Monitor - H200 Training Analysis\n",
245
+ "======================================================================\n",
246
+ "⏰ Monitoring Time: 1.6/30 minutes\n",
247
+ "πŸ• Current Time: 22:44:08\n",
248
+ "\n",
249
+ "πŸ”₯ GPU 0: NVIDIA H200 NVL\n",
250
+ " πŸ’» Compute Utilization: 100.0% 🟒\n",
251
+ " 🧠 Memory: 138.7GB / 140.4GB ( 98.8%) 🟒\n",
252
+ " 🌑️ Temperature: 83.0°C 🟑\n",
253
+ " ⚑ Power: 540.7W / 600.0W ( 90.1%) 🟒\n",
254
+ "\n",
255
+ "πŸ“Š UTILIZATION ANALYSIS:\n",
256
+ " βœ… EXCELLENT: GPU is fully utilized!\n",
257
+ " πŸ“ˆ Average GPU Compute: 100.0%\n",
258
+ " πŸ’Ύ Average Memory Usage: 98.8%\n"
259
+ ]
260
+ }
261
+ ],
262
+ "source": [
263
+ "monitor_gpu(30, 2) "
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": null,
269
+ "id": "910a1785-c220-4bbc-810c-8df4ca8e9931",
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": []
273
+ }
274
+ ],
275
+ "metadata": {
276
+ "kernelspec": {
277
+ "display_name": "Python 3 (ipykernel)",
278
+ "language": "python",
279
+ "name": "python3"
280
+ },
281
+ "language_info": {
282
+ "codemirror_mode": {
283
+ "name": "ipython",
284
+ "version": 3
285
+ },
286
+ "file_extension": ".py",
287
+ "mimetype": "text/x-python",
288
+ "name": "python",
289
+ "nbconvert_exporter": "python",
290
+ "pygments_lexer": "ipython3",
291
+ "version": "3.10.12"
292
+ }
293
+ },
294
+ "nbformat": 4,
295
+ "nbformat_minor": 5
296
+ }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bbc192d0f96870619f0cf8660469dcb3e58db1859ebc98ebe73bb2ce4f8bca14
3
  size 960760928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ec865c41f048ddedd7437b869d7037cf8898242aaff867df014f34b8c3ddc81
3
  size 960760928