# Spaces: Runtime error  (page-status text captured alongside this file; not part of the code)
import pandas as pd | |
from huggingface_hub import snapshot_download | |
import subprocess | |
import re | |
import os | |
import GPUtil | |
try: | |
from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name | |
except: | |
print("local debug: from display.utils") | |
from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name | |
# Peak memory bandwidth per GPU model; looked up by get_peak_bw().
# Keys use the "<GPU-name>-<memory>GB" form.
# NOTE(review): values are presumably in GB/s — confirm against vendor specs.
MEM_BW_DICT = {
    "NVIDIA-A100-PCIe-80GB": 1935,
    "NVIDIA-A100-SXM-80GB": 2039,
    "NVIDIA-H100-PCIe-80GB": 2039,
    "NVIDIA-RTX-A5000-24GB": 768,
}
# Peak compute throughput, indexed first by precision and then by GPU model;
# looked up by get_peak_flops().
# NOTE(review): values are presumably FLOP/s (OP/s for the integer widths) —
# confirm the source of these figures before relying on them.
PEAK_FLOPS_DICT = {
    "float32": {
        "NVIDIA-A100-PCIe-80GB": 312e12,
        "NVIDIA-A100-SXM-80GB": 312e12,
        "NVIDIA-H100-PCIe-80GB": 756e12,
        "NVIDIA-RTX-A5000-24GB": 222.2e12,
    },
    "float16": {
        "NVIDIA-A100-PCIe-80GB": 624e12,
        "NVIDIA-A100-SXM-80GB": 624e12,
        "NVIDIA-H100-PCIe-80GB": 1513e12,
        "NVIDIA-RTX-A5000-24GB": 444.4e12,
    },
    "8bit": {
        "NVIDIA-A100-PCIe-80GB": 1248e12,
        "NVIDIA-A100-SXM-80GB": 1248e12,
        "NVIDIA-H100-PCIe-80GB": 3026e12,
        "NVIDIA-RTX-A5000-24GB": 889e12,
    },
    "4bit": {
        "NVIDIA-A100-PCIe-80GB": 2496e12,
        "NVIDIA-A100-SXM-80GB": 2496e12,
        "NVIDIA-H100-PCIe-80GB": 6052e12,
        "NVIDIA-RTX-A5000-24GB": 1778e12,
    },
}
def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers, max_retries=10, retry_delay=60):
    """Download a repository snapshot, retrying when the download raises.

    The first five arguments are forwarded unchanged, as keyword arguments,
    to ``snapshot_download``.

    Args:
        repo_id: Repository identifier passed to ``snapshot_download``.
        revision: Revision passed to ``snapshot_download``.
        local_dir: Target directory passed to ``snapshot_download``.
        repo_type: Repository type passed to ``snapshot_download``.
        max_workers: Worker count passed to ``snapshot_download``.
        max_retries: Total number of download attempts (default 10).
        retry_delay: Seconds to sleep between attempts (default 60).

    Returns:
        None, both on success and after the final failed attempt. Failures
        are reported on stdout rather than raised (best-effort download).
    """
    # Function-scope import: the module's top-level imports do not include time.
    import time

    for attempt in range(1, max_retries + 1):
        try:
            snapshot_download(
                repo_id=repo_id, revision=revision, local_dir=local_dir, repo_type=repo_type, max_workers=max_workers
            )
            return
        except Exception as e:
            if attempt == max_retries:
                # No retry follows the last attempt, so neither claim one nor
                # block the caller for another retry_delay seconds.
                print(
                    f"Failed to download {repo_id} at {revision} with error: {e}. "
                    f"Giving up after {max_retries} attempts."
                )
                return
            print(f"Failed to download {repo_id} at {revision} with error: {e}. Retrying...")
            time.sleep(retry_delay)
    return
def get_dataset_url(row):
    """Render a row's benchmark name as an HTML link to its dataset.

    Args:
        row: Mapping-like object indexable by ``"Benchmark"`` (link text)
            and ``"Dataset Link"`` (link target).

    Returns:
        An ``<a>`` element string that opens the dataset link in a new tab.
    """
    label = row["Benchmark"]
    href = row["Dataset Link"]
    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{href}" style="{anchor_style}">{label}</a>'
def get_dataset_summary_table(file_path):
    """Load the dataset summary CSV and turn benchmark names into links.

    Args:
        file_path: Path of the CSV file handed to ``pd.read_csv``.

    Returns:
        A DataFrame restricted to the Category, Benchmark, Data Split,
        Data Size and Language columns, where Benchmark holds the HTML
        produced by ``get_dataset_url`` for each row.
    """
    summary_columns = ["Category", "Benchmark", "Data Split", "Data Size", "Language"]
    table = pd.read_csv(file_path)
    # Row-wise apply: each row supplies both the name and the link.
    table["Benchmark"] = table.apply(get_dataset_url, axis=1)
    return table[summary_columns]
def parse_nvidia_smi():
    """Take one ``nvidia-smi`` sample and aggregate it over the visible GPUs.

    GPU indices come from ``CUDA_VISIBLE_DEVICES`` when it is set, otherwise
    from ``nvidia-smi --query-gpu=index``. ``nvidia-smi -i <index>`` is then
    run once per index, and every output line matching the stats pattern
    contributes one per-GPU sample.

    Returns:
        A one-element list holding a dict with the mean temperature, mean
        power draw and mean utilisation under ``GPU_TEMP``, ``GPU_Power``
        and ``GPU_Util``, the summed memory usage (MiB / 1024) under
        ``GPU_Mem``, and a ``"<count>x<name>"`` label under ``GPU_Name``.
        An empty list is returned when the index query fails or when no
        stats line could be parsed.
    """
    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
    if visible_devices is not None:
        raw_indices = visible_devices.split(',')
    else:
        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
        result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
        if result.returncode != 0:
            print("Failed to query GPU indices.")
            return []
        raw_indices = result.stdout.strip().split('\n')
    # An empty or padded CUDA_VISIBLE_DEVICES ("" or "0, 1") would otherwise
    # hand blank / space-prefixed ids to `nvidia-smi -i`.
    gpu_indices = [entry.strip() for entry in raw_indices if entry.strip()]

    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')

    gpu_stats = []
    gpu_name = ""
    for index in gpu_indices:
        result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
        for line in result.stdout.strip().split("\n"):
            name_match = gpu_name_pattern.search(line)
            if name_match:
                # The last matching name wins; it labels the whole sample.
                gpu_name = name_match.group(1).strip()
            match = gpu_info_pattern.search(line)
            if match:
                temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
                gpu_stats.append({
                    GPU_TEMP: temp,
                    GPU_Power: power_usage,
                    GPU_Mem: round(mem_usage / 1024, 2),
                    GPU_Util: gpu_util
                })

    if not gpu_stats:
        # Nothing parsed: averaging would divide by zero. Callers such as
        # monitor_gpus() already skip an empty result.
        return []

    gpu_count = len(gpu_stats)
    gpu_stats_total = {
        GPU_TEMP: sum(stat[GPU_TEMP] for stat in gpu_stats) / gpu_count,
        GPU_Power: sum(stat[GPU_Power] for stat in gpu_stats) / gpu_count,
        # Memory is reported as the total across GPUs, not the mean.
        GPU_Mem: sum(stat[GPU_Mem] for stat in gpu_stats),
        GPU_Util: sum(stat[GPU_Util] for stat in gpu_stats) / gpu_count,
        GPU_Name: f"{gpu_count}x{gpu_name}"
    }
    return [gpu_stats_total]
def monitor_gpus(stop_event, interval, stats_list):
    """Collect GPU samples into ``stats_list`` until ``stop_event`` is set.

    Args:
        stop_event: Object exposing ``is_set()`` and ``wait(timeout)``;
            polled before every sample and used to pause between samples.
        interval: Timeout passed to ``stop_event.wait`` after each sample.
        stats_list: List extended in place with each non-empty result of
            ``parse_nvidia_smi``.
    """
    while True:
        if stop_event.is_set():
            return
        samples = parse_nvidia_smi()
        if samples:
            stats_list.extend(samples)
        # Waiting on the event (not sleeping) lets a stop request end the pause early.
        stop_event.wait(interval)
def analyze_gpu_stats(stats_list):
    """Summarise a list of GPU samples into a single dict.

    Args:
        stats_list: List of sample dicts that share the keys of the first
            entry.

    Returns:
        None when ``stats_list`` is empty. Otherwise a dict holding the
        mean of every key except ``GPU_Mem`` and ``GPU_Name``, the
        ``GPU_Name`` of the first sample (when present), and the maximum
        ``GPU_Mem`` seen across the samples.
    """
    if not stats_list:
        return None

    first_sample = stats_list[0]
    sample_count = len(stats_list)

    # Averages for everything that is neither the memory figure nor the label.
    summary = {
        key: sum(sample[key] for sample in stats_list) / sample_count
        for key in first_sample.keys()
        if not (key == GPU_Mem or key == GPU_Name)
    }

    # Memory is summarised by its peak rather than its mean.
    peak_memory = max(sample[GPU_Mem] for sample in stats_list)

    if GPU_Name in first_sample:
        summary[GPU_Name] = first_sample[GPU_Name]
    summary.update({GPU_Mem: peak_memory})
    return summary
def get_gpu_number():
    """Count the GPUs for which ``nvidia-smi`` prints a parsable stats line.

    GPU indices come from ``CUDA_VISIBLE_DEVICES`` when it is set, otherwise
    from ``nvidia-smi --query-gpu=index``. ``nvidia-smi -i <index>`` is run
    once per index and each output line matching the stats pattern counts
    as one GPU.

    Returns:
        The number of matching stats lines as an ``int``. ``0`` is returned
        when the index query fails or when no index is available, so the
        return type is the same on every path.
    """
    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
    if visible_devices is not None:
        raw_indices = visible_devices.split(',')
    else:
        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
        result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
        if result.returncode != 0:
            print("Failed to query GPU indices.")
            return 0
        raw_indices = result.stdout.strip().split('\n')
    # An empty or padded CUDA_VISIBLE_DEVICES ("" or "0, 1") would otherwise
    # hand blank / space-prefixed ids to `nvidia-smi -i`.
    gpu_indices = [entry.strip() for entry in raw_indices if entry.strip()]

    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
    gpu_count = 0
    for index in gpu_indices:
        result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
        # Only the number of stats lines matters here; their values are not needed.
        gpu_count += sum(
            1 for line in result.stdout.strip().split("\n") if gpu_info_pattern.search(line)
        )
    return gpu_count
def get_gpu_details():
    """Build the ``"<name>-<memory>GB"`` identifier of the first GPU.

    Returns:
        The first GPU's name with spaces replaced by dashes, followed by a
        dash and its total memory in whole GB (e.g. the key format used by
        ``MEM_BW_DICT``).

    Raises:
        IndexError: If ``GPUtil.getGPUs()`` returns no GPUs.
    """
    first_gpu = GPUtil.getGPUs()[0]
    dashed_name = first_gpu.name.replace(" ", "-")
    # Convert memory from MB to GB and round to nearest whole number
    total_gb = round(first_gpu.memoryTotal / 1024)
    return f"{dashed_name}-{total_gb}GB"
def get_peak_bw(gpu_name):
    """Look up the ``MEM_BW_DICT`` entry for ``gpu_name``.

    Raises:
        KeyError: If ``gpu_name`` is not a key of ``MEM_BW_DICT``.
    """
    peak_bandwidth = MEM_BW_DICT[gpu_name]
    return peak_bandwidth
def get_peak_flops(gpu_name, precision):
    """Look up the ``PEAK_FLOPS_DICT`` entry for ``precision`` and ``gpu_name``.

    Raises:
        KeyError: If ``precision`` is not in ``PEAK_FLOPS_DICT`` or
            ``gpu_name`` is not listed under that precision.
    """
    # Precision is the outer key, GPU model the inner one.
    per_gpu_peaks = PEAK_FLOPS_DICT[precision]
    return per_gpu_peaks[gpu_name]
def transfer_precision2bytes(precision):
    """Map a precision name to the number of bytes one value occupies.

    Args:
        precision: One of ``"float32"``, ``"float16"``, ``"8bit"`` or
            ``"4bit"``.

    Returns:
        4, 2, 1 or 0.5 respectively.

    Raises:
        ValueError: If ``precision`` is none of the supported names.
    """
    bytes_per_value = (
        ("float32", 4),
        ("float16", 2),
        ("8bit", 1),
        ("4bit", 0.5),
    )
    # Compared by equality, in order, so any input type is accepted.
    for name, width in bytes_per_value:
        if precision == name:
            return width
    raise ValueError(f"Unsupported precision: {precision}")
if __name__ == "__main__":
    # Manual check: take a single nvidia-smi sample and print its summary.
    sample = parse_nvidia_smi()
    print(analyze_gpu_stats(sample))