Commit 9ffef81 · Parent(s): f38163c

Fix some bugs

Files changed:
- backend-cli.py +26 -24
- src/backend/envs.py +1 -1
- src/display/utils.py +1 -0
- src/submission/check_validity.py +2 -1
- src/utils.py +102 -1
backend-cli.py CHANGED

@@ -17,7 +17,7 @@ from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
 
 from src.envs import QUEUE_REPO, RESULTS_REPO, API, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
-from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus
+from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus, get_gpu_details
 
 from src.leaderboard.read_evals import get_raw_eval_results
 
@@ -142,9 +142,6 @@ def request_to_result_name(request: EvalRequest) -> str:
 def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
     batch_size = 1
     batch_size = eval_request.batch_size
-
-    if args.debug:
-        RESULTS_REPO = DEBUG_RESULTS_REPO
 
     init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
     # if init_gpu_info['Mem(M)'] > 500:
@@ -388,21 +385,7 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -> bool:
 
     return False
 
-
-def get_gpu_details():
-    gpus = GPUtil.getGPUs()
-    gpu = gpus[0]
-    name = gpu.name.replace(" ", "-")
-    # Convert memory from MB to GB and round to nearest whole number
-    memory_gb = round(gpu.memoryTotal / 1024)
-    memory = f"{memory_gb}GB"
-    formatted_name = f"{name}-{memory}"
-    return formatted_name
-
 def process_pending_requests() -> bool:
-    if args.debug:
-        QUEUE_REPO = DEBUG_QUEUE_REPO
-
     sanity_checks()
     print("Processing pending requests")
     current_pending_status = [PENDING_STATUS]
@@ -472,6 +455,7 @@ def get_args():
     parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
     parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
                         help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
+    parser.add_argument("--debug_repo", action="store_true", help="Use debug repo")
     return parser.parse_args()
 
 
@@ -479,7 +463,7 @@ if __name__ == "__main__":
     args = get_args()
     local_debug = args.debug
     # debug specific task by ping
-    if local_debug:
+    if local_debug and not args.debug_repo:
         # debug_model_names = [args.model]  # Use model from arguments
         # debug_task_name = [args.task]  # Use task from arguments
         debug_model_names = args.model.split(",")
@@ -510,25 +494,43 @@ if __name__ == "__main__":
                 results = process_evaluation(task, eval_request, limit=args.limit)
                 # except Exception as e:
                 #     print(f"debug running error: {e}")
-
+    elif local_debug and args.debug_repo:
+        QUEUE_REPO = DEBUG_QUEUE_REPO
+        RESULTS_REPO = DEBUG_RESULTS_REPO
         while True:
             res = False
-
             # if random.randint(0, 10) == 0:
             res = process_pending_requests()
             print(f"waiting for 60 seconds")
             time.sleep(60)
-
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(100)
             #     else:
             #         res = process_finished_requests(100)
-
             # time.sleep(60)
-
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(0)
             #     else:
             #         res = process_finished_requests(0)
+    elif not local_debug and not args.debug_repo:
+        while True:
+            res = False
+            # if random.randint(0, 10) == 0:
+            res = process_pending_requests()
+            print(f"waiting for 60 seconds")
+            time.sleep(60)
+            # if res is False:
+            #     if random.randint(0, 5) == 0:
+            #         res = maybe_refresh_results(100)
+            #     else:
+            #         res = process_finished_requests(100)
+            # time.sleep(60)
+            # if res is False:
+            #     if random.randint(0, 5) == 0:
+            #         res = maybe_refresh_results(0)
+            #     else:
+            #         res = process_finished_requests(0)
+    else:
+        raise Exception("Cannot use debug_repo without local debug flag")
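Two of the deletions above were latent bugs: assigning RESULTS_REPO / QUEUE_REPO inside process_evaluation and process_pending_requests only created function-local names (no global declaration), so the debug repos never took effect for the rest of the module. The commit moves the override to module level under __main__, behind the new --debug_repo flag, and relocates get_gpu_details into src/utils.py next to the other GPU helpers. A minimal runnable sketch of the resulting dispatch; run_local_task_debug and poll_request_queue are hypothetical stubs standing in for the two code paths already in the file:

    # Sketch only: the stubbed functions are hypothetical names for the
    # existing "evaluate specific models/tasks" and "poll the queue" paths.
    DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO = "debug-queue", "debug-results"

    def run_local_task_debug():   # hypothetical: one-off local evaluation
        print("evaluating debug models/tasks locally")

    def poll_request_queue():     # hypothetical: the 60-second polling loop
        print("polling pending requests")

    local_debug, debug_repo = True, True   # stand-ins for args.debug / args.debug_repo

    if local_debug and not debug_repo:
        run_local_task_debug()
    elif local_debug and debug_repo:
        QUEUE_REPO = DEBUG_QUEUE_REPO      # module-level rebinding actually sticks
        RESULTS_REPO = DEBUG_RESULTS_REPO
        poll_request_queue()
    elif not local_debug and not debug_repo:
        poll_request_queue()               # production: poll the real queue
    else:
        raise Exception("Cannot use debug_repo without local debug flag")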
    	
src/backend/envs.py CHANGED

@@ -57,7 +57,7 @@ class Tasks(Enum):
 
     # task20 = Task("race", "acc", "RACE", 0)
     task21 = Task("mmlu", "acc", "MMLU", 5)
-    task22 = Task("gsm8k", "
+    task22 = Task("gsm8k", "em", "GSM8K", 5)
 
 
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
    	
src/display/utils.py CHANGED

@@ -75,6 +75,7 @@ class Tasks(Enum):
     # # XXX include me back at some point
     selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
+    gsm8k = Task("gsm8k", "em", "GSM8K") #GSM8K/EM (5-shot)
 
 
 # These classes are for user facing column names,
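Both Tasks enums gain a GSM8K entry scored by exact match ("em"); the backend entry in src/backend/envs.py above also carries the 5-shot count. A minimal sketch of how such an entry is typically consumed, assuming Task is a small record with the positional fields suggested by the constructor calls (the field names here are illustrative, not necessarily the repo's):

    from dataclasses import dataclass
    from enum import Enum

    @dataclass(frozen=True)
    class Task:  # illustrative reconstruction; real field names may differ
        benchmark: str  # harness task name, e.g. "gsm8k"
        metric: str     # metric key to read from results, e.g. "em"
        col_name: str   # user-facing column label, e.g. "GSM8K"

    class Tasks(Enum):
        mmlu = Task("mmlu", "acc", "MMLU")
        gsm8k = Task("gsm8k", "em", "GSM8K")  # the entry this commit adds

    # Reading the metric out of a results dict keyed by benchmark name:
    results = {"gsm8k": {"em": 0.57}}  # hypothetical scores
    t = Tasks.gsm8k.value
    print(f"{t.col_name}: {results[t.benchmark][t.metric]}")  # GSM8K: 0.57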
    	
src/submission/check_validity.py CHANGED

@@ -130,7 +130,8 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
                 continue
             with open(os.path.join(root, file), "r") as f:
                 info = json.load(f)
-
+                if not info["status"] == "FINISHED" and not info["status"] == "RUNNING":
+                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
 
             # Select organisation
             if info["model"].count("/") == 0 or "submitted_time" not in info:
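With this change, only requests whose status is neither FINISHED nor RUNNING contribute to the duplicate-submission set, and the key now distinguishes inference framework and GPU type, so the same model can be re-queued under a different runtime or hardware. A small sketch of the key being built, using a hypothetical request record shaped like the eval-queue JSON files:

    # Hypothetical request record; the field values are illustrative.
    info = {
        "model": "org/model-7b",
        "revision": "main",
        "precision": "float16",
        "inference_framework": "hf",
        "gpu_type": "NVIDIA-A100-PCIe-80GB",
        "status": "PENDING",
    }

    if not info["status"] == "FINISHED" and not info["status"] == "RUNNING":
        key = f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}"
        print(key)  # org/model-7b_main_float16_hf_NVIDIA-A100-PCIe-80GB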
    	
src/utils.py CHANGED

@@ -3,12 +3,48 @@ from huggingface_hub import snapshot_download
 import subprocess
 import re
 import os
+import GPUtil
 
 try:
     from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
 except:
     print("local debug: from display.utils")
     from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
+
+MEM_BW_DICT ={
+    "NVIDIA-A100-PCIe-80GB": 1935,
+    "NVIDIA-A100-SXM-80GB": 2039,
+    "NVIDIA-H100-PCIe-80GB": 2039,
+    "NVIDIA-RTX-A5000-24GB": 768
+}
+
+PEAK_FLOPS_DICT = {
+    "float32":{
+        "NVIDIA-A100-PCIe-80GB": 312e12,
+        "NVIDIA-A100-SXM-80GB": 312e12,
+        "NVIDIA-H100-PCIe-80GB": 756e12,
+        "NVIDIA-RTX-A5000-24GB": 222.2e12
+    },
+    "float16":{
+        "NVIDIA-A100-PCIe-80GB": 624e12,
+        "NVIDIA-A100-SXM-80GB": 624e12,
+        "NVIDIA-H100-PCIe-80GB": 1513e12,
+        "NVIDIA-RTX-A5000-24GB": 444.4e12
+    },
+    "8bit":{
+        "NVIDIA-A100-PCIe-80GB": 1248e12,
+        "NVIDIA-A100-SXM-80GB": 1248e12,
+        "NVIDIA-H100-PCIe-80GB": 3026e12,
+        "NVIDIA-RTX-A5000-24GB": 889e12
+    },
+    "4bit": {
+        "NVIDIA-A100-PCIe-80GB": 2496e12,
+        "NVIDIA-A100-SXM-80GB": 2496e12,
+        "NVIDIA-H100-PCIe-80GB": 6052e12,
+        "NVIDIA-RTX-A5000-24GB": 1778e12
+    }
+
+}
 
 def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
     for i in range(10):
@@ -56,7 +92,7 @@ def parse_nvidia_smi():
     gpu_stats = []
 
     gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
-    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]
+    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
 
     gpu_name = ""
     for index in gpu_indices:
@@ -131,5 +167,70 @@ def analyze_gpu_stats(stats_list):
 
     return avg_stats
 
+def get_gpu_number():
+    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    if visible_devices is not None:
+        gpu_indices = visible_devices.split(',')
+    else:
+        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
+        result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
+        if result.returncode != 0:
+            print("Failed to query GPU indices.")
+            return []
+        gpu_indices = result.stdout.strip().split('\n')
+    # print(f"gpu_indices: {gpu_indices}")
+    gpu_stats = []
+
+    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
+
+    for index in gpu_indices:
+        result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
+        output = result.stdout.strip()
+        lines = output.split("\n")
+        for line in lines:
+            match = gpu_info_pattern.search(line)
+            gpu_info = {}
+            if match:
+                temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
+                gpu_info.update({
+                    GPU_TEMP: temp,
+                    GPU_Power: power_usage,
+                    GPU_Mem: round(mem_usage / 1024, 2),
+                    GPU_Util: gpu_util
+                })
+
+            if len(gpu_info) >= 4:
+                gpu_stats.append(gpu_info)
+
+    return len(gpu_stats)
+
+def get_gpu_details():
+    gpus = GPUtil.getGPUs()
+    gpu = gpus[0]
+    name = gpu.name.replace(" ", "-")
+    # Convert memory from MB to GB and round to nearest whole number
+    memory_gb = round(gpu.memoryTotal / 1024)
+    memory = f"{memory_gb}GB"
+    formatted_name = f"{name}-{memory}"
+    return formatted_name
+
+def get_peak_bw(gpu_name):
+    return MEM_BW_DICT[gpu_name]
+
+def get_peak_flops(gpu_name, precision):
+    return PEAK_FLOPS_DICT[precision][gpu_name]
+
+def transfer_precision2bytes(precision):
+    if precision == "float32":
+        return 4
+    elif precision == "float16":
+        return 2
+    elif precision == "8bit":
+        return 1
+    elif precision == "4bit":
+        return 0.5
+    else:
+        raise ValueError(f"Unsupported precision: {precision}")
+
 if __name__ == "__main__":
     print(analyze_gpu_stats(parse_nvidia_smi()))
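The new lookup tables and helpers allow measured throughput to be compared against a hardware ceiling. A minimal sketch, not the leaderboard's actual efficiency formula, under three assumptions: it runs from the repo root so src.utils imports, the MEM_BW_DICT values are GB/s (they match vendor spec-sheet figures), and decoding is memory-bound, i.e. each generated token streams every weight from GPU memory once:

    from src.utils import get_peak_bw, get_peak_flops, transfer_precision2bytes

    def peak_decode_tokens_per_sec(gpu_name: str, precision: str, n_params: float) -> float:
        # Bytes one token must read: every parameter at this precision's width.
        bytes_per_token = n_params * transfer_precision2bytes(precision)
        peak_bw_bytes = get_peak_bw(gpu_name) * 1e9  # GB/s -> bytes/s (assumed units)
        return peak_bw_bytes / bytes_per_token

    # A 7B-parameter model in float16 on an A100-PCIe-80GB:
    # 1935e9 / (7e9 * 2) ~ 138 tokens/s upper bound.
    print(peak_decode_tokens_per_sec("NVIDIA-A100-PCIe-80GB", "float16", 7e9))
    print(get_peak_flops("NVIDIA-A100-PCIe-80GB", "float16"))  # 6.24e14 FLOP/s from the table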