import argparse
import math
import subprocess
import time

import torch


def parse_args():
    parser = argparse.ArgumentParser(description='Keep GPUs busy with repeated matrix multiplications')
    parser.add_argument('--gpus', help='List of GPU IDs', required=True, type=int, nargs='+')
    parser.add_argument('--size', help='Memory (MiB) to leave free on each GPU', required=True, type=int)
    parser.add_argument('--interval', help='Sleep interval between multiplications (seconds)', required=True, type=float)
    return parser.parse_args()


def calculate_matrix_size(memory_gb, num_matrices=2):
    """Return the largest square-matrix dimension that fits in memory_gb GiB."""
    # Convert GiB to bytes.
    memory_bytes = memory_gb * (1024 ** 3)
    # Budget for the input matrices plus the result matrix.
    bytes_per_matrix = memory_bytes / (num_matrices + 1)
    # 4 bytes per float32 element, with an extra factor of 2 as a safety margin.
    size_squared = bytes_per_matrix / (4 * 2)
    return int(math.sqrt(size_squared))


# Example:
# memory_gb = 12
# size = calculate_matrix_size(memory_gb)
# print(f"Size for {memory_gb}G memory: {size}")


def get_gpu_memory(gpu_ids, reserve_mb):
    """Query free memory (MiB) of each GPU via nvidia-smi, minus a reserved margin."""
    memory_list = []
    for gpu_id in gpu_ids:
        try:
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,noheader,nounits', '-i', str(gpu_id)],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True
            )
            memory_free = int(result.stdout.decode().strip())
            memory_list.append(memory_free - reserve_mb)  # Leave some headroom for overhead.
        except subprocess.CalledProcessError as e:
            print(f"Error querying GPU {gpu_id}: {e}")
            memory_list.append(None)  # Mark the GPU as unusable.
    return memory_list


def matrix_multiplication(args):
    a_list, b_list, result = [], [], []
    memory_list = get_gpu_memory(args.gpus, args.size)
    print("Remaining GPU memory (MB):", memory_list)
    for index, gpu_id in enumerate(args.gpus):
        if memory_list[index] is not None and memory_list[index] > 0:
            memory_gb = memory_list[index] // 1024  # MiB -> GiB
            size = calculate_matrix_size(memory_gb)
            print(memory_gb, size)
            device = f'cuda:{gpu_id}'
            a_list.append(torch.rand(size, size, device=device))
            b_list.append(torch.rand(size, size, device=device))
            result.append(torch.empty(size, size, device=device))
        else:
            print(f"GPU {gpu_id} has insufficient free memory or failed to respond; skipping it.")
            a_list.append(None)
            b_list.append(None)
            result.append(None)
    while True:
        for i in range(len(args.gpus)):
            if a_list[i] is not None and b_list[i] is not None:
                # Write into the pre-allocated buffer instead of allocating a new result each time.
                torch.matmul(a_list[i], b_list[i], out=result[i])
        time.sleep(args.interval)


if __name__ == "__main__":
    args = parse_args()
    # args.gpus = [5, 6, 7]  # Optionally hard-code GPU IDs instead of passing --gpus.
    matrix_multiplication(args)
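

# Example invocation (a sketch, not from the original source): the filename
# "occupy_gpu.py" and the values below are assumptions; --gpus, --size, and
# --interval are the flags defined in parse_args() above. This keeps GPUs 0 and 1
# busy, leaves roughly 2048 MiB free on each, and re-runs the matmul every 0.5 s:
#
#   python occupy_gpu.py --gpus 0 1 --size 2048 --interval 0.5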