Wendy committed on
Commit
cb187a1
·
verified ·
1 Parent(s): 6123c5a

Upload target_all.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. target_all.py +95 -0
target_all.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import time
4
+ import subprocess
5
+ import argparse
6
+
7
def parse_args(argv=None):
    """Parse the command-line options of the GPU matrix-multiplication script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so calling
            ``parse_args()`` with no arguments behaves as before.

    Returns:
        argparse.Namespace with three attributes:
            gpus: list of integer GPU IDs (one or more).
            size: integer number of MB that ``get_gpu_memory`` subtracts from
                the free memory reported for each GPU.
            interval: float passed to ``time.sleep`` by the multiplication
                loop.
    """
    parser = argparse.ArgumentParser(description='Matrix multiplication')
    parser.add_argument('--gpus', help='List of GPU IDs', required=True, type=int, nargs='+')
    # The value is used as a memory reservation, not as a matrix dimension,
    # so the help text says so.
    parser.add_argument(
        '--size',
        help='Memory in MB to subtract from the free memory of each GPU',
        required=True,
        type=int,
    )
    parser.add_argument('--interval', help='Sleep interval', required=True, type=float)
    return parser.parse_args(argv)
14
+
15
+ import math
16
+
17
def calculate_matrix_size(memory_gb, num_matrices=2, bytes_per_element=4 * 2):
    """Return the side length of square matrices that fit a memory budget.

    The budget is split evenly between ``num_matrices`` operand matrices and
    one result matrix, and each share is divided by ``bytes_per_element`` to
    obtain the element count of a single square matrix.

    Args:
        memory_gb: Memory budget in GiB (int or float, must be >= 0).
        num_matrices: Number of operand matrices; one extra matrix is always
            budgeted for the result.
        bytes_per_element: Bytes budgeted per matrix element. The default of
            8 is twice the 4 bytes of a float32 value, so float32 matrices of
            the returned size occupy roughly half of the budget and the rest
            is left as headroom.

    Returns:
        The side length as an int, rounded down.

    Raises:
        ValueError: If ``memory_gb`` or ``num_matrices`` is negative, or if
            ``bytes_per_element`` is not positive.
    """
    if memory_gb < 0:
        raise ValueError(f"memory_gb must be non-negative, got {memory_gb}")
    if num_matrices < 0:
        raise ValueError(f"num_matrices must be non-negative, got {num_matrices}")
    if bytes_per_element <= 0:
        raise ValueError(f"bytes_per_element must be positive, got {bytes_per_element}")

    # Convert GiB to bytes.
    memory_bytes = memory_gb * (1024 ** 3)

    # Share of the budget available to each operand matrix and to the result.
    bytes_per_matrix = memory_bytes / (num_matrices + 1)

    # Elements per matrix, then the side of the square holding that many.
    size_squared = bytes_per_matrix / bytes_per_element
    return int(math.sqrt(size_squared))
31
+
32
+ # 输入显存大小
33
+ # memory_gb = 12
34
+ # size = calculate_matrix_size(memory_gb)
35
+ # print(f"Size for {memory_gb}G memory: {size}")
36
+
37
+
38
+
39
+
40
+
41
+
42
def get_gpu_memory(gpu_ids, reserve_mb=None):
    """Query the free memory of each GPU with ``nvidia-smi``.

    Args:
        gpu_ids: Iterable of GPU indices passed to ``nvidia-smi -i``.
        reserve_mb: Amount subtracted from every reported free-memory value.
            When ``None`` (the default) the ``size`` attribute of a
            module-level ``args`` object is used if one exists, which is what
            the original implementation read; otherwise nothing is
            subtracted.

    Returns:
        A list with one entry per GPU, in the order of ``gpu_ids``: the free
        memory reported by ``nvidia-smi`` minus ``reserve_mb`` (may be
        negative), or ``None`` when the query for that GPU failed.
    """
    if reserve_mb is None:
        # Backward compatibility: fall back to the script's global ``args``
        # instead of failing with NameError when it has not been defined.
        cli_args = globals().get('args')
        reserve_mb = getattr(cli_args, 'size', 0)

    memory_list = []
    for gpu_id in gpu_ids:
        try:
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,noheader,nounits', '-i', str(gpu_id)],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=True
            )
            memory_free = int(result.stdout.decode().strip())
        except (subprocess.CalledProcessError, OSError, ValueError) as e:
            # CalledProcessError: nvidia-smi exited with a non-zero status.
            # OSError: the nvidia-smi executable could not be started.
            # ValueError: the output was not a single integer.
            print(f"Error querying GPU {gpu_id}: {e}")
            memory_list.append(None)  # Mark this GPU as unavailable
        else:
            memory_list.append(memory_free - reserve_mb)
    return memory_list
59
+
60
def matrix_multiplication(args):
    """Allocate matrices on each usable GPU and multiply them in an endless loop.

    For every GPU in ``args.gpus`` the remaining memory is obtained from
    ``get_gpu_memory``, converted to whole GB and turned into a matrix side
    length by ``calculate_matrix_size``. Two random operand matrices and one
    result buffer are allocated on that GPU. GPUs whose query failed or whose
    remaining memory is not positive are skipped.

    Args:
        args: Namespace providing ``gpus`` (list of GPU IDs) and ``interval``
            (value passed to ``time.sleep``).

    Returns:
        Never returns; the multiplication loop runs until the process is
        interrupted.
    """
    a_list, b_list, result = [], [], []

    memory_list = get_gpu_memory(args.gpus)
    print("Remaining GPU memory (MB):", memory_list)

    for gpu_id, free_mb in zip(args.gpus, memory_list):
        # ``None`` marks a failed query; comparing it with 0 would raise
        # TypeError, so it is checked explicitly.
        if free_mb is not None and free_mb > 0:
            memory_gb = free_mb // 1024  # MB -> whole GB (rounded down)

            size = calculate_matrix_size(memory_gb)
            print(memory_gb, size)
            a_list.append(torch.rand(size, size, device=gpu_id))
            b_list.append(torch.rand(size, size, device=gpu_id))
            result.append(torch.empty(size, size, device=gpu_id))
        else:
            print(f"GPU {gpu_id} 的显存不足或出现错误,跳过该 GPU。")
            a_list.append(None)
            b_list.append(None)
            result.append(None)

    while True:
        for a, b, out in zip(a_list, b_list, result):
            if a is not None and b is not None:
                # Write into the preallocated buffer instead of allocating a
                # new result tensor on every iteration.
                torch.matmul(a, b, out=out)
        # NOTE(review): the original indentation was lost; the sleep is placed
        # once per sweep over all GPUs -- confirm this matches the intent.
        time.sleep(args.interval)
88
+
89
if __name__ == "__main__":
    # ``args`` is deliberately kept as a module-level name because
    # get_gpu_memory reads ``args.size`` from the module globals.
    args = parse_args()

    # The GPU list now comes from the required --gpus option; the previous
    # hard-coded ``[5, 6, 7]`` override silently ignored the user's choice.
    matrix_multiplication(args)
94
+
95
+