# target_all.py
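"""Keep the selected GPUs busy: fill most of each GPU's free memory with two
random matrices and multiply them in an endless loop."""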
import argparse
import math
import subprocess
import time

import torch


def parse_args():
    parser = argparse.ArgumentParser(description='Matrix multiplication')
    parser.add_argument('--gpus', help='List of GPU IDs', required=True, type=int, nargs='+')
    parser.add_argument('--size', help='Memory headroom to leave free on each GPU (MiB)', required=True, type=int)
    parser.add_argument('--interval', help='Sleep interval between multiplications (seconds)', required=True, type=float)
    return parser.parse_args()
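# Example invocation (GPU IDs and values are illustrative):
#   python target_all.py --gpus 0 1 --size 1024 --interval 0.1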
def calculate_matrix_size(memory_gb, num_matrices=2):
    # Convert GiB to bytes
    memory_bytes = memory_gb * (1024 ** 3)
    # Split the budget across the input matrices plus the result matrix
    bytes_per_matrix = memory_bytes / (num_matrices + 1)
    # 4 bytes per float32 element; the extra factor of 2 halves the allocation, leaving headroom
    size_squared = bytes_per_matrix / (4 * 2)
    size = math.sqrt(size_squared)
    return int(size)
# Example: compute the matrix size for a given amount of GPU memory
# memory_gb = 12
# size = calculate_matrix_size(memory_gb)
# print(f"Size for {memory_gb}G memory: {size}")
def get_gpu_memory(gpu_ids, headroom_mb):
    memory_list = []
    for gpu_id in gpu_ids:
        try:
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,noheader,nounits', '-i', str(gpu_id)],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=True
            )
            memory_free = int(result.stdout.decode().strip())  # free memory in MiB
            memory_list.append(memory_free - headroom_mb)  # adjust memory to account for overhead
        except subprocess.CalledProcessError as e:
            print(f"Error querying GPU {gpu_id}: {e}")
            memory_list.append(None)  # record None so the caller can skip this GPU
    return memory_list
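# A possible pure-PyTorch alternative to shelling out to nvidia-smi -- a sketch,
# assuming torch.cuda.mem_get_info is available (PyTorch >= 1.10). Note that it
# reports bytes, while nvidia-smi above reports MiB, hence the conversion.
def get_gpu_memory_torch(gpu_ids, headroom_mb):
    memory_list = []
    for gpu_id in gpu_ids:
        free_bytes, _total_bytes = torch.cuda.mem_get_info(gpu_id)
        memory_list.append(free_bytes // (1024 ** 2) - headroom_mb)
    return memory_list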
def matrix_multiplication(args):
    a_list, b_list, result = [], [], []
    memory_list = get_gpu_memory(args.gpus, args.size)
    print("Remaining GPU memory (MB):", memory_list)
    for index, gpu_id in enumerate(args.gpus):
        if memory_list[index] is not None and memory_list[index] > 0:
            memory_gb = memory_list[index] // 1024  # convert MiB to GiB
            size = calculate_matrix_size(memory_gb)
            print(memory_gb, size)
            a_list.append(torch.rand(size, size, device=gpu_id))
            b_list.append(torch.rand(size, size, device=gpu_id))
            result.append(torch.empty(size, size, device=gpu_id))
        else:
            print(f"GPU {gpu_id} has insufficient free memory or returned an error; skipping it.")
            a_list.append(None)
            b_list.append(None)
            result.append(None)
    # Multiply forever; the sleep throttles how often new work is queued
    while True:
        for i in range(len(args.gpus)):
            if a_list[i] is not None and b_list[i] is not None:
                result[i] = torch.matmul(a_list[i], b_list[i])
        time.sleep(args.interval)
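# Design note: reassigning result[i] allocates a fresh output tensor on every
# iteration. An alternative (not what the original does) would be
# torch.matmul(a_list[i], b_list[i], out=result[i]), which reuses the
# preallocated buffer from the setup loop above.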
if __name__ == "__main__":
    args = parse_args()
    # Hard-coded override of the --gpus argument; remove this line to respect the CLI flag
    args.gpus = [5, 6, 7]
    matrix_multiplication(args)