Spaces:

sparse-generative-ai
/

open-moe-llm-leaderboard

Runtime error

App Files Files Community

open-moe-llm-leaderboard / src /utils.py

AppleSwing

Fix some bugs

9ffef81 over 1 year ago

raw

history blame

8.06 kB

	import pandas as pd
	from huggingface_hub import snapshot_download
	import subprocess
	import re
	import os
	import GPUtil

	try:
	from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
	except:
	print("local debug: from display.utils")
	from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name

	# Memory bandwidth per GPU model, looked up by get_peak_bw().
	# Keys have the "<name-with-dashes>-<N>GB" shape that get_gpu_details() builds.
	# NOTE(review): values are presumably peak memory bandwidth in GB/s -- the
	# unit is not stated in this file; confirm against vendor datasheets.
	MEM_BW_DICT ={
	"NVIDIA-A100-PCIe-80GB": 1935,
	"NVIDIA-A100-SXM-80GB": 2039,
	"NVIDIA-H100-PCIe-80GB": 2039,
	"NVIDIA-RTX-A5000-24GB": 768
	}

	# Peak compute throughput, indexed first by precision label and then by GPU
	# model; looked up by get_peak_flops().
	# The precision labels are the same four strings that
	# transfer_precision2bytes() accepts.
	# NOTE(review): values are presumably FLOP/s (or OP/s for the integer
	# precisions); each narrower precision roughly doubles the previous row.
	# Which vendor figure these come from (dense vs. sparse, tensor-core vs.
	# not) is not stated here -- confirm before relying on them.
	PEAK_FLOPS_DICT = {
	"float32":{
	"NVIDIA-A100-PCIe-80GB": 312e12,
	"NVIDIA-A100-SXM-80GB": 312e12,
	"NVIDIA-H100-PCIe-80GB": 756e12,
	"NVIDIA-RTX-A5000-24GB": 222.2e12
	},
	"float16":{
	"NVIDIA-A100-PCIe-80GB": 624e12,
	"NVIDIA-A100-SXM-80GB": 624e12,
	"NVIDIA-H100-PCIe-80GB": 1513e12,
	"NVIDIA-RTX-A5000-24GB": 444.4e12
	},
	"8bit":{
	"NVIDIA-A100-PCIe-80GB": 1248e12,
	"NVIDIA-A100-SXM-80GB": 1248e12,
	"NVIDIA-H100-PCIe-80GB": 3026e12,
	"NVIDIA-RTX-A5000-24GB": 889e12
	},
	"4bit": {
	"NVIDIA-A100-PCIe-80GB": 2496e12,
	"NVIDIA-A100-SXM-80GB": 2496e12,
	"NVIDIA-H100-PCIe-80GB": 6052e12,
	"NVIDIA-RTX-A5000-24GB": 1778e12
	}

	}

	def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
	    """Download a Hub snapshot, retrying when the download fails.

	    ``snapshot_download`` is attempted up to 10 times with a 60 second
	    pause between attempts. Failures are printed, never raised: when
	    every attempt fails the function still returns ``None`` so callers
	    keep their best-effort behaviour.

	    Args:
	        repo_id: Forwarded unchanged to ``snapshot_download``.
	        revision: Forwarded unchanged to ``snapshot_download``.
	        local_dir: Forwarded unchanged to ``snapshot_download``.
	        repo_type: Forwarded unchanged to ``snapshot_download``.
	        max_workers: Forwarded unchanged to ``snapshot_download``.

	    Returns:
	        None, whether or not the download eventually succeeded.
	    """
	    import time

	    max_attempts = 10
	    retry_delay_s = 60

	    for attempt in range(1, max_attempts + 1):
	        try:
	            snapshot_download(
	                repo_id=repo_id,
	                revision=revision,
	                local_dir=local_dir,
	                repo_type=repo_type,
	                max_workers=max_workers,
	            )
	        except Exception as e:
	            if attempt == max_attempts:
	                # No attempt follows the last one, so do not claim a retry
	                # and do not block the caller for another minute.
	                print(
	                    f"Failed to download {repo_id} at {revision} with error: {e}. "
	                    f"Giving up after {max_attempts} attempts."
	                )
	                return
	            print(f"Failed to download {repo_id} at {revision} with error: {e}. Retrying...")
	            time.sleep(retry_delay_s)
	        else:
	            return


	def get_dataset_url(row):
	    """Render one benchmark row as an HTML link.

	    Args:
	        row: Mapping-like object providing the "Benchmark" (link text)
	            and "Dataset Link" (link target) entries.

	    Returns:
	        An ``<a>`` element string that opens the dataset link in a new
	        tab and shows the benchmark name with a dotted underline.
	    """
	    label = row["Benchmark"]
	    href = row["Dataset Link"]
	    return (
	        f'<a target="_blank" href="{href}" '
	        f'style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">'
	        f'{label}</a>'
	    )


	def get_dataset_summary_table(file_path):
	    """Load the benchmark CSV and return its display columns.

	    Args:
	        file_path: Path (or anything ``pandas.read_csv`` accepts) of the
	            CSV file describing the benchmarks.

	    Returns:
	        A DataFrame restricted to the "Category", "Benchmark",
	        "Data Split", "Data Size" and "Language" columns, where
	        "Benchmark" has been replaced by the HTML link produced by
	        ``get_dataset_url`` for that row.
	    """
	    table = pd.read_csv(file_path)

	    # Turn the plain benchmark name into a link, one row at a time.
	    table["Benchmark"] = table.apply(get_dataset_url, axis=1)

	    display_columns = ["Category", "Benchmark", "Data Split", "Data Size", "Language"]
	    return table[display_columns]

	def parse_nvidia_smi():
	    """Take one ``nvidia-smi`` reading of the visible GPUs and aggregate it.

	    The GPU indices come from ``CUDA_VISIBLE_DEVICES`` when that variable
	    is set, otherwise from ``nvidia-smi --query-gpu=index``. For each
	    index the human-readable ``nvidia-smi -i <index>`` table is scanned
	    for the GPU name and for the status row holding temperature, power
	    draw, memory use and utilisation.

	    Returns:
	        A one-element list containing a dict with:
	        GPU_TEMP, GPU_Power, GPU_Util -- mean over the parsed GPUs;
	        GPU_Mem -- sum over the parsed GPUs of MiB / 1024 (rounded to
	        two decimals per GPU);
	        GPU_Name -- "<number of parsed GPUs>x<last GPU name seen>".
	        An empty list is returned when the indices cannot be queried or
	        when no status row could be parsed.
	    """
	    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
	    if visible_devices is not None:
	        raw_indices = visible_devices.split(',')
	    else:
	        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
	        result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
	        if result.returncode != 0:
	            print("Failed to query GPU indices.")
	            return []
	        raw_indices = result.stdout.strip().split('\n')

	    # Ignore blank entries (e.g. CUDA_VISIBLE_DEVICES="" or "0, 1") so that
	    # nvidia-smi is never invoked with an empty or padded index.
	    gpu_indices = [idx.strip() for idx in raw_indices if idx.strip()]

	    # The table columns are separated by a literal "|", which must be
	    # written as "\|" in the pattern; an unescaped "|" is regex alternation
	    # and would leave most capture groups set to None.
	    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
	    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')

	    gpu_stats = []
	    gpu_name = ""
	    for index in gpu_indices:
	        result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
	        for line in result.stdout.strip().split("\n"):
	            name_match = gpu_name_pattern.search(line)
	            if name_match:
	                gpu_name = name_match.group(1).strip()

	            match = gpu_info_pattern.search(line)
	            if match:
	                temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
	                gpu_stats.append({
	                    GPU_TEMP: temp,
	                    GPU_Power: power_usage,
	                    GPU_Mem: round(mem_usage / 1024, 2),
	                    GPU_Util: gpu_util,
	                })

	    if not gpu_stats:
	        # Nothing was parsed: report "no sample" instead of dividing by zero.
	        return []

	    gpu_count = len(gpu_stats)
	    gpu_stats_total = {
	        GPU_TEMP: sum(stat[GPU_TEMP] for stat in gpu_stats) / gpu_count,
	        GPU_Power: sum(stat[GPU_Power] for stat in gpu_stats) / gpu_count,
	        # Memory is summed across GPUs rather than averaged.
	        GPU_Mem: sum(stat[GPU_Mem] for stat in gpu_stats),
	        GPU_Util: sum(stat[GPU_Util] for stat in gpu_stats) / gpu_count,
	        GPU_Name: f"{gpu_count}x{gpu_name}",
	    }
	    return [gpu_stats_total]

	def monitor_gpus(stop_event, interval, stats_list):
	    """Poll the GPUs until ``stop_event`` is set.

	    Args:
	        stop_event: Object exposing ``is_set()`` and ``wait(timeout)``;
	            polling ends once ``is_set()`` returns true.
	        interval: Timeout handed to ``stop_event.wait`` between polls.
	        stats_list: List extended in place with every non-empty result
	            of ``parse_nvidia_smi``.
	    """
	    while True:
	        if stop_event.is_set():
	            break
	        sample = parse_nvidia_smi()
	        if sample:
	            stats_list.extend(sample)
	        # Waiting on the event (rather than sleeping) lets a stop request
	        # end the pause early.
	        stop_event.wait(interval)

	def analyze_gpu_stats(stats_list):
	    """Collapse a list of GPU samples into a single summary dict.

	    Args:
	        stats_list: List of per-sample dicts. The keys of the first
	            sample decide which metrics are summarised.

	    Returns:
	        None when ``stats_list`` is empty. Otherwise a dict holding the
	        mean of every metric except GPU_Mem and GPU_Name, the GPU_Name
	        of the first sample (when it has one), and the maximum GPU_Mem
	        seen across all samples.
	    """
	    if not stats_list:
	        return None

	    first_sample = stats_list[0]
	    sample_count = len(stats_list)

	    # Mean of every numeric metric; memory and name are handled below.
	    summary = {
	        metric: sum(sample[metric] for sample in stats_list) / sample_count
	        for metric in first_sample.keys()
	        if metric != GPU_Mem and metric != GPU_Name
	    }

	    # Memory is reported as the peak across samples, not the mean.
	    peak_memory = max(sample[GPU_Mem] for sample in stats_list)

	    if GPU_Name in first_sample:
	        summary[GPU_Name] = first_sample[GPU_Name]
	    summary[GPU_Mem] = peak_memory

	    return summary

	def get_gpu_number():
	    """Count the GPUs whose status row can be parsed from ``nvidia-smi``.

	    The GPU indices come from ``CUDA_VISIBLE_DEVICES`` when that variable
	    is set, otherwise from ``nvidia-smi --query-gpu=index``. Each index
	    is inspected with ``nvidia-smi -i <index>`` and every output line
	    matching the temperature / power / memory / utilisation row counts
	    as one GPU.

	    Returns:
	        The number of matched status rows as an int; 0 when the indices
	        cannot be queried or nothing could be parsed.
	    """
	    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
	    if visible_devices is not None:
	        raw_indices = visible_devices.split(',')
	    else:
	        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
	        result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
	        if result.returncode != 0:
	            print("Failed to query GPU indices.")
	            # Keep the return type an int (falsy, like the former empty list).
	            return 0
	        raw_indices = result.stdout.strip().split('\n')

	    # Ignore blank entries (e.g. CUDA_VISIBLE_DEVICES="" or "0, 1") so that
	    # nvidia-smi is never invoked with an empty or padded index.
	    gpu_indices = [idx.strip() for idx in raw_indices if idx.strip()]

	    # The table columns are separated by a literal "|", which must be
	    # written as "\|" in the pattern; an unescaped "|" is regex alternation.
	    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')

	    gpu_count = 0
	    for index in gpu_indices:
	        result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
	        gpu_count += sum(
	            1
	            for line in result.stdout.strip().split("\n")
	            if gpu_info_pattern.search(line)
	        )

	    return gpu_count

	def get_gpu_details():
	    """Build the lookup name of the first GPU reported by GPUtil.

	    Returns:
	        A string of the form "<name with spaces replaced by dashes>-<N>GB",
	        where N is the GPU's ``memoryTotal`` divided by 1024 and rounded
	        to the nearest whole number.
	    """
	    first_gpu = GPUtil.getGPUs()[0]
	    dashed_name = first_gpu.name.replace(" ", "-")
	    # memoryTotal / 1024 converts to GB (assuming GPUtil reports MB, as the
	    # original comment states).
	    total_gb = round(first_gpu.memoryTotal / 1024)
	    return f"{dashed_name}-{total_gb}GB"

	def get_peak_bw(gpu_name):
	    """Look up the MEM_BW_DICT entry for ``gpu_name``.

	    Raises:
	        KeyError: If ``gpu_name`` is not a key of MEM_BW_DICT.
	    """
	    bandwidth = MEM_BW_DICT[gpu_name]
	    return bandwidth

	def get_peak_flops(gpu_name, precision):
	    """Look up the PEAK_FLOPS_DICT entry for a precision and GPU model.

	    Args:
	        gpu_name: GPU key inside the per-precision table.
	        precision: Outer key of PEAK_FLOPS_DICT (e.g. "float16").

	    Raises:
	        KeyError: If either ``precision`` or ``gpu_name`` is not listed.
	    """
	    per_gpu_table = PEAK_FLOPS_DICT[precision]
	    return per_gpu_table[gpu_name]

	def transfer_precision2bytes(precision):
	    """Return the per-value size in bytes for a precision label.

	    Args:
	        precision: One of "float32", "float16", "8bit" or "4bit".

	    Returns:
	        4, 2, 1 or 0.5 respectively.

	    Raises:
	        ValueError: If ``precision`` is none of the supported labels.
	    """
	    byte_widths = (
	        ("float32", 4),
	        ("float16", 2),
	        ("8bit", 1),
	        ("4bit", 0.5),
	    )
	    for label, width in byte_widths:
	        if precision == label:
	            return width
	    raise ValueError(f"Unsupported precision: {precision}")

	if __name__ == "__main__":
	    # Manual check: take a single GPU sample and print its summary.
	    single_sample = parse_nvidia_smi()
	    print(analyze_gpu_stats(single_sample))