|
|
|
import torch |
|
import time |
|
import subprocess |
|
import argparse |
|
|
|
def parse_args(argv=None):
    """Parse the command-line options of the matrix-multiplication script.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what the
            original zero-argument call did.

    Returns:
        argparse.Namespace with ``gpus`` (list of int), ``size`` (int) and
        ``interval`` (float). All three options are required, so argparse
        exits with a usage message when any of them is missing.

    Note:
        Elsewhere in this file ``size`` is subtracted from the free memory
        figure returned by nvidia-smi (see ``get_gpu_memory``), so in practice
        it acts as memory headroom rather than as a matrix dimension.
    """
    parser = argparse.ArgumentParser(description='Matrix multiplication')
    parser.add_argument('--gpus', help='List of GPU IDs', required=True, type=int, nargs='+')
    parser.add_argument('--size', help='Matrix size', required=True, type=int)
    parser.add_argument('--interval', help='Sleep interval', required=True, type=float)
    # Passing argv through lets callers and tests drive the parser directly.
    return parser.parse_args(argv)
|
|
|
import math |
|
|
|
def calculate_matrix_size(memory_gb, num_matrices=2):
    """Return the side length of square matrices for a given memory budget.

    The budget is converted from GiB to bytes, split into
    ``num_matrices + 1`` equal shares, and each share is divided by 8 before
    taking the square root. The result is truncated to an int.

    Args:
        memory_gb: Memory budget in GiB (multiplied by 1024 ** 3).
        num_matrices: Number of input matrices; one extra share is set aside
            on top of this count.

    Returns:
        int: floor of the computed side length for non-negative budgets.
    """
    gib = 2 ** 30
    # The divisor 8 is written as 4 * 2 in the original; presumably 4 bytes
    # per float32 element times a 2x safety margin -- TODO confirm.
    share_divisor = 8

    budget_bytes = memory_gb * gib
    share_bytes = budget_bytes / (num_matrices + 1)
    element_count = share_bytes / share_divisor
    return int(math.sqrt(element_count))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_gpu_memory(gpu_ids, reserve_mb=None):
    """Query the free memory of each GPU with nvidia-smi, minus a reserve.

    Args:
        gpu_ids: Iterable of GPU indices passed to ``nvidia-smi -i``.
        reserve_mb: Amount subtracted from every free-memory reading. When
            ``None`` (the default) the value is taken from ``size`` on the
            module-level ``args`` namespace, as the original code did; if no
            such global exists, nothing is subtracted.

    Returns:
        list: one entry per GPU id, in order. Each entry is the integer
        printed by nvidia-smi minus the reserve, or ``None`` when the query
        for that GPU failed. The caller labels these values as MB.
    """
    if reserve_mb is None:
        # Backward compatibility with the original hidden dependency on the
        # global ``args``; avoids a NameError when this module is imported.
        cli_args = globals().get('args')
        reserve_mb = getattr(cli_args, 'size', 0)

    memory_list = []
    for gpu_id in gpu_ids:
        try:
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,noheader,nounits', '-i', str(gpu_id)],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=True,
            )
            memory_free = int(result.stdout.decode().strip())
        except (subprocess.CalledProcessError, OSError, ValueError) as e:
            # OSError: nvidia-smi is missing or not executable.
            # ValueError: the output was not a single integer.
            print(f"Error querying GPU {gpu_id}: {e}")
            memory_list.append(None)
        else:
            memory_list.append(memory_free - reserve_mb)
    return memory_list
|
|
|
def matrix_multiplication(args):
    """Allocate matrices on every usable GPU and multiply them forever.

    For each GPU in ``args.gpus`` the remaining memory reported by
    ``get_gpu_memory`` is turned into a matrix side length with
    ``calculate_matrix_size``, and two random matrices plus one result
    buffer are created on that device. GPUs whose remaining memory is not
    positive, or whose query failed, are skipped.

    Args:
        args: Namespace providing ``gpus`` (list of GPU ids) and
            ``interval`` (seconds slept between rounds).

    Returns:
        Never returns; the multiplication loop runs until the process is
        interrupted.
    """
    a_list, b_list, result = [], [], []

    memory_list = get_gpu_memory(args.gpus)
    print("Remaining GPU memory (MB):", memory_list)

    for gpu_id, free_mb in zip(args.gpus, memory_list):
        # get_gpu_memory stores None for a failed query; comparing None with
        # 0 raises TypeError, so test for it explicitly before the threshold.
        if free_mb is None or free_mb <= 0:
            print(f"GPU {gpu_id} 的显存不足或出现错误,跳过该 GPU。")
            # Placeholders keep the three lists index-aligned with args.gpus.
            a_list.append(None)
            b_list.append(None)
            result.append(None)
            continue

        # Whole gigabytes only: the remainder below 1024 MB is dropped.
        memory_gb = free_mb // 1024
        size = calculate_matrix_size(memory_gb)
        print(memory_gb, size)
        a_list.append(torch.rand(size, size, device=gpu_id))
        b_list.append(torch.rand(size, size, device=gpu_id))
        result.append(torch.empty(size, size, device=gpu_id))

    while True:
        for i, (a, b) in enumerate(zip(a_list, b_list)):
            if a is not None and b is not None:
                # Rebinds the slot to a new tensor, as the original did.
                result[i] = torch.matmul(a, b)
        # NOTE(review): the source lost its indentation; the sleep is assumed
        # to run once per pass over all GPUs -- confirm against the original.
        time.sleep(args.interval)
|
|
|
if __name__ == "__main__":
    # Keep the module-level name ``args``: get_gpu_memory reads ``args.size``
    # from the global namespace.
    args = parse_args()

    # The GPU list now comes from the required --gpus option; it was
    # previously overwritten with the hard-coded IDs [5, 6, 7].
    matrix_multiplication(args)
|
|
|
|
|
|