Wendy
committed on
Upload target_all.py with huggingface_hub
target_all.py +75 -0
target_all.py
ADDED
@@ -0,0 +1,75 @@
+import torch
+import time
+import subprocess
+import argparse
+import math
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Matrix multiplication')
+    parser.add_argument('--gpus', help='List of GPU IDs', required=True, type=int, nargs='+')
+    parser.add_argument('--size', help='Memory to reserve per GPU as overhead (MiB)', required=True, type=int)
+    parser.add_argument('--interval', help='Sleep interval between multiplications (seconds)', required=True, type=float)
+    return parser.parse_args()
+
+def calculate_matrix_size(memory_gb, num_matrices=2):
+    # Convert GiB to bytes.
+    memory_bytes = memory_gb * (1024 ** 3)
+    # Split the budget across the input matrices plus the result matrix.
+    bytes_per_matrix = memory_bytes / (num_matrices + 1)
+    # 4 bytes per float32 element, with an extra factor of 2 as headroom.
+    size_squared = bytes_per_matrix / (4 * 2)
+    return int(math.sqrt(size_squared))
+
+# Example: pick a matrix size for a given amount of GPU memory.
+# memory_gb = 12
+# size = calculate_matrix_size(memory_gb)
+# print(f"Size for {memory_gb}G memory: {size}")
+
+def get_gpu_memory(gpu_ids, overhead_mb):
+    memory_list = []
+    for gpu_id in gpu_ids:
+        try:
+            result = subprocess.run(
+                ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,noheader,nounits', '-i', str(gpu_id)],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                check=True
+            )
+            memory_free = int(result.stdout.decode().strip())
+            memory_list.append(memory_free - overhead_mb)  # Reserve some free memory as overhead.
+        except subprocess.CalledProcessError as e:
+            print(f"Error querying GPU {gpu_id}: {e}")
+            memory_list.append(None)  # If the query fails, record None for this GPU.
+    return memory_list
+
+def matrix_multiplication(args):
+    a_list, b_list, result = [], [], []
+
+    memory_list = get_gpu_memory(args.gpus, args.size)
+    print("Remaining GPU memory (MiB):", memory_list)
+
+    for index, gpu_id in enumerate(args.gpus):
+        if memory_list[index] is not None and memory_list[index] > 0:
+            memory_gb = memory_list[index] // 1024  # Convert free MiB to GiB.
+            size = calculate_matrix_size(memory_gb)
+            print(memory_gb, size)
+            a_list.append(torch.rand(size, size, device=gpu_id))
+            b_list.append(torch.rand(size, size, device=gpu_id))
+            result.append(torch.empty(size, size, device=gpu_id))
+        else:
+            print(f"GPU {gpu_id} has insufficient memory or errored; skipping it.")
+            a_list.append(None)
+            b_list.append(None)
+            result.append(None)
+
+    # Keep the GPUs busy with repeated multiplications.
+    while True:
+        for i in range(len(args.gpus)):
+            if a_list[i] is not None and b_list[i] is not None:
+                result[i] = torch.matmul(a_list[i], b_list[i])
+            time.sleep(args.interval)
+
+if __name__ == "__main__":
+    args = parse_args()
+    # args.gpus = [5, 6, 7]  # Debug override; uncomment to ignore the --gpus flag.
+    matrix_multiplication(args)
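A minimal usage sketch (the GPU IDs and flag values below are illustrative, not part of the commit):

python target_all.py --gpus 0 1 --size 512 --interval 0.5

This queries the free memory on GPUs 0 and 1 via nvidia-smi, keeps 512 MiB of headroom on each, sizes two float32 matrices per GPU to fill the remaining memory, and re-multiplies them every 0.5 seconds.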