We express our sincere gratitude to NetMind.AI for their generous donation of GPUs, which plays a crucial role in ensuring the continuous operation of our Leaderboard.
+{error}</p>"
+
+
+def styled_warning(warn):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+
+def styled_message(message):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>
" + + +def has_no_nan_values(df, columns): + return df[columns].notna().all(axis=1) + + +def has_nan_values(df, columns): + return df[columns].isna().any(axis=1) diff --git a/open-moe-llm-leaderboard-gh/src/display/imgs/Netmind.AI_LOGO.jpg b/open-moe-llm-leaderboard-gh/src/display/imgs/Netmind.AI_LOGO.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6ccff65e32b3fa0545a66ee3937df979ed542891 Binary files /dev/null and b/open-moe-llm-leaderboard-gh/src/display/imgs/Netmind.AI_LOGO.jpg differ diff --git a/open-moe-llm-leaderboard-gh/src/display/utils.py b/open-moe-llm-leaderboard-gh/src/display/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..98188b5b94fbae0ba6f856711cdbb42e3b2e821c --- /dev/null +++ b/open-moe-llm-leaderboard-gh/src/display/utils.py @@ -0,0 +1,265 @@ +from dataclasses import dataclass, make_dataclass +from enum import Enum + +import pandas as pd + + +def fields(raw_class): + return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"] + +E2Es = "E2E(s)" #"End-to-end time (s)" +PREs = "PRE(s)" #"Prefilling time (s)" +TS = "T/s" #Decoding throughput (tok/s) +InFrame = "Method" #"Inference framework" +MULTIPLE_CHOICEs = ["mmlu"] + +GPU_TEMP = 'Temp(C)' +GPU_Power = 'Power(W)' +GPU_Mem = 'Mem(G)' +GPU_Name = "GPU" +GPU_Util = 'Util(%)' +BATCH_SIZE = 'bs' +PRECISION = "Precision" +system_metrics_to_name_map = { + "end_to_end_time": f"{E2Es}", + "prefilling_time": f"{PREs}", + "decoding_throughput": f"{TS}", +} + +gpu_metrics_to_name_map = { + GPU_Util: GPU_Util, + GPU_TEMP: GPU_TEMP, + GPU_Power: GPU_Power, + GPU_Mem: GPU_Mem, + "batch_size": BATCH_SIZE, + "precision": PRECISION, + GPU_Name: GPU_Name, +} + +@dataclass +class Task: + benchmark: str + metric: str + col_name: str + + +class Tasks(Enum): + # XXX include me back at some point + # nqopen = Task("nq8", "em", "NQ Open/EM") + # triviaqa = Task("tqa8", "em", "TriviaQA/EM") + + # truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "TruthQA MC1/Acc") + # truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthQA MC2/Acc") + # truthfulqa_gen = Task("truthfulqa_gen", "rougeL_acc", "TruthQA Gen/ROUGE") + + # xsum_r = Task("xsum_v2", "rougeL", "XSum/ROUGE") + # xsum_f = Task("xsum_v2", "factKB", "XSum/factKB") + # xsum_b = Task("xsum_v2", "bertscore_precision", "XSum/BERT-P") + + # cnndm_r = Task("cnndm_v2", "rougeL", "CNN-DM/ROUGE") + # cnndm_f = Task("cnndm_v2", "factKB", "CNN-DM/factKB") + # cnndm_b = Task("cnndm_v2", "bertscore_precision", "CNN-DM/BERT-P") + + # race = Task("race", "acc", "RACE/Acc") + # squadv2 = Task("squadv2", "exact", "SQUaDv2/EM") + + # memotrap = Task("memo-trap_v2", "acc", "MemoTrap/Acc") + # ifeval = Task("ifeval", "prompt_level_strict_acc", "IFEval/Acc") + + # faithdial = Task("faithdial_hallu_v2", "acc", "FaithDial/Acc") + + # halueval_qa = Task("halueval_qa", "acc", "HaluQA/Acc") + # halueval_summ = Task("halueval_summarization", "acc", "HaluSumm/Acc") + # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc") + + # # XXX include me back at some point + selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT") + mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot) + gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (8-shot) + + +# These classes are for user facing column names, +# to avoid having to change them all around the code +# when a modif is needed +@dataclass +class ColumnContent: + name: str + type: str + displayed_by_default: bool + hidden: bool = False + never_hidden: bool = False + dummy: 
bool = False + + +auto_eval_column_dict = [] +# Init +auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)]) +auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)]) + +# #Scores +# # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)]) + +# Inference framework +auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True)]) + +for task in Tasks: + auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) + # System performance metrics + auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)]) + auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)]) + # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)]) + auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)]) + auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)]) + auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)]) + if task.value.benchmark in MULTIPLE_CHOICEs: + continue + # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)]) + auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True, hidden=True)]) + + +# Model information +auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)]) +auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)]) +auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)]) +auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)]) +auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)]) +auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)]) +auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)]) +auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)]) +auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]) +# Dummy column for the search bar (hidden by the custom CSS) +auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)]) + +# We use make dataclass to dynamically fill the scores from Tasks +AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True) + + +@dataclass(frozen=True) +class EvalQueueColumn: # Queue column + model = ColumnContent("model", "markdown", True) + revision = ColumnContent("revision", "str", True) + private = ColumnContent("private", "bool", True) + precision = 
ColumnContent("precision", "str", True) + weight_type = ColumnContent("weight_type", "str", "Original") + model_framework = ColumnContent("inference_framework", "str", True) + status = ColumnContent("status", "str", True) + + +@dataclass +class ModelDetails: + name: str + symbol: str = "" # emoji, only for the model type + + +class ModelType(Enum): + PT = ModelDetails(name="pretrained", symbol="🟢") + FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶") + chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬") + merges = ModelDetails(name="base merges and moerges", symbol="🤝") + Unknown = ModelDetails(name="", symbol="?") + + def to_str(self, separator=" "): + return f"{self.value.symbol}{separator}{self.value.name}" + + @staticmethod + def from_str(type): + if "fine-tuned" in type or "🔶" in type: + return ModelType.FT + if "pretrained" in type or "🟢" in type: + return ModelType.PT + if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]): + return ModelType.chat + if "merge" in type or "🤝" in type: + return ModelType.merges + return ModelType.Unknown + + +class InferenceFramework(Enum): + # "moe-infinity", hf-chat + MoE_Infinity = ModelDetails("moe-infinity") + HF_Chat = ModelDetails("hf-chat") + Unknown = ModelDetails("?") + + def to_str(self): + return self.value.name + + @staticmethod + def from_str(inference_framework: str): + if inference_framework in ["moe-infinity"]: + return InferenceFramework.MoE_Infinity + if inference_framework in ["hf-chat"]: + return InferenceFramework.HF_Chat + return InferenceFramework.Unknown + +class GPUType(Enum): + H100_pcie = ModelDetails("NVIDIA-H100-PCIe-80GB") + A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB") + A5000 = ModelDetails("NVIDIA-RTX-A5000-24GB") + Unknown = ModelDetails("?") + + def to_str(self): + return self.value.name + + @staticmethod + def from_str(gpu_type: str): + if gpu_type in ["NVIDIA-H100-PCIe-80GB"]: + return GPUType.A100_pcie + if gpu_type in ["NVIDIA-A100-PCIe-80GB"]: + return GPUType.H100_pcie + if gpu_type in ["NVIDIA-A5000-24GB"]: + return GPUType.A5000 + return GPUType.Unknown + +class WeightType(Enum): + Adapter = ModelDetails("Adapter") + Original = ModelDetails("Original") + Delta = ModelDetails("Delta") + + +class Precision(Enum): + float32 = ModelDetails("float32") + float16 = ModelDetails("float16") + bfloat16 = ModelDetails("bfloat16") + qt_8bit = ModelDetails("8bit") + qt_4bit = ModelDetails("4bit") + qt_GPTQ = ModelDetails("GPTQ") + Unknown = ModelDetails("?") + + @staticmethod + def from_str(precision: str): + if precision in ["torch.float32", "float32"]: + return Precision.float32 + if precision in ["torch.float16", "float16"]: + return Precision.float16 + if precision in ["torch.bfloat16", "bfloat16"]: + return Precision.bfloat16 + if precision in ["8bit"]: + return Precision.qt_8bit + if precision in ["4bit"]: + return Precision.qt_4bit + if precision in ["GPTQ", "None"]: + return Precision.qt_GPTQ + return Precision.Unknown + + +# Column selection +COLS = [c.name for c in fields(AutoEvalColumn)] +TYPES = [c.type for c in fields(AutoEvalColumn)] +COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden] +TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden] + +EVAL_COLS = [c.name for c in fields(EvalQueueColumn)] +EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)] + +BENCHMARK_COLS = [t.value.col_name for t in Tasks] + +# NUMERIC_INTERVALS = { +# "?": 
pd.Interval(-1, 0, closed="right"), +# "~1.5": pd.Interval(0, 2, closed="right"), +# "~3": pd.Interval(2, 4, closed="right"), +# "~7": pd.Interval(4, 9, closed="right"), +# "~13": pd.Interval(9, 20, closed="right"), +# "~35": pd.Interval(20, 45, closed="right"), +# "~60": pd.Interval(45, 70, closed="right"), +# "70+": pd.Interval(70, 10000, closed="right"), +# } diff --git a/open-moe-llm-leaderboard-gh/src/envs.py b/open-moe-llm-leaderboard-gh/src/envs.py new file mode 100644 index 0000000000000000000000000000000000000000..0ee354bb13392b1c1a3abc26343ee8401b7239f0 --- /dev/null +++ b/open-moe-llm-leaderboard-gh/src/envs.py @@ -0,0 +1,36 @@ +import os + +from huggingface_hub import HfApi + +# clone / pull the lmeh eval data +H4_TOKEN = os.environ.get("H4_TOKEN", None) + +# REPO_ID = "pminervini/sparse-generative-ai" +REPO_ID = "sparse-generative-ai/open-moe-llm-leaderboard" + +QUEUE_REPO = "sparse-generative-ai/requests" +QUEUE_REPO_OPEN_LLM = "open-llm-leaderboard/requests" +RESULTS_REPO = "sparse-generative-ai/results" + +DEBUG_QUEUE_REPO = "sparse-generative-ai/debug_requests" +DEBUG_RESULTS_REPO = "sparse-generative-ai/debug_results" + +IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True)) + +CACHE_PATH = os.getenv("HF_HOME", ".") + +EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue") +EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results") +EVAL_REQUESTS_PATH_OPEN_LLM = os.path.join(CACHE_PATH, "eval-queue-open-llm") + +EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private" +EVAL_RESULTS_PATH_PRIVATE = "eval-results-private" + +PATH_TO_COLLECTION = "sparse-generative-ai/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03" + +# Rate limit variables +RATE_LIMIT_PERIOD = 7 +RATE_LIMIT_QUOTA = 5 +HAS_HIGHER_RATE_LIMIT = ["TheBloke"] + +API = HfApi(token=H4_TOKEN) diff --git a/open-moe-llm-leaderboard-gh/src/leaderboard/filter_models.py b/open-moe-llm-leaderboard-gh/src/leaderboard/filter_models.py new file mode 100644 index 0000000000000000000000000000000000000000..efbe83cf4d7203fca388b7afd1d801bd00dfc626 --- /dev/null +++ b/open-moe-llm-leaderboard-gh/src/leaderboard/filter_models.py @@ -0,0 +1,50 @@ +from src.display.formatting import model_hyperlink +from src.display.utils import AutoEvalColumn + +# Models which have been flagged by users as being problematic for a reason or another +# (Model name to forum discussion link) +FLAGGED_MODELS = { + "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202", + "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207", + "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213", + "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236", + "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237", + "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215", + "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287", + "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287", + "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287", +} + +# Models which have been requested by orgs to not be submitted on 
the leaderboard +DO_NOT_SUBMIT_MODELS = [ + "Voicelab/trurl-2-13b", # trained on MMLU +] + + +def flag_models(leaderboard_data: list[dict]): + for model_data in leaderboard_data: + if model_data["model_name_for_query"] in FLAGGED_MODELS: + issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1] + issue_link = model_hyperlink( + FLAGGED_MODELS[model_data["model_name_for_query"]], + f"See discussion #{issue_num}", + ) + model_data[AutoEvalColumn.model.name] = ( + f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}" + ) + + +def remove_forbidden_models(leaderboard_data: list[dict]): + indices_to_remove = [] + for ix, model in enumerate(leaderboard_data): + if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS: + indices_to_remove.append(ix) + + for ix in reversed(indices_to_remove): + leaderboard_data.pop(ix) + return leaderboard_data + + +def filter_models(leaderboard_data: list[dict]): + leaderboard_data = remove_forbidden_models(leaderboard_data) + flag_models(leaderboard_data) diff --git a/open-moe-llm-leaderboard-gh/src/leaderboard/read_evals.py b/open-moe-llm-leaderboard-gh/src/leaderboard/read_evals.py new file mode 100644 index 0000000000000000000000000000000000000000..bd75bb4d916a9843e6f1670850d827734e91f945 --- /dev/null +++ b/open-moe-llm-leaderboard-gh/src/leaderboard/read_evals.py @@ -0,0 +1,290 @@ +import glob +import json +import os +from tqdm import tqdm +from dataclasses import dataclass + +import dateutil + +# import numpy as np + +from src.display.formatting import make_clickable_model +from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType +from src.submission.check_validity import is_model_on_hub + +from typing import Optional + + +def is_float(string): + try: + float(string) + return True + except ValueError: + return False + + +@dataclass +class EvalResult: + # Also see src.display.utils.AutoEvalColumn for what will be displayed. + eval_name: str # org_model_precision (uid) + full_model: str # org/model (path on hub) + org: str + model: str + revision: str # commit hash, "" if main + results: dict + precision: Precision = Precision.Unknown + model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ... + weight_type: WeightType = WeightType.Original # Original or Adapter + architecture: str = "Unknown" # From config file + license: str = "?" 
+ likes: int = 0 + num_params: int = 0 + date: str = "" # submission date of request file + still_on_hub: bool = False + inference_framework: str = "Unknown" + + @staticmethod + def init_from_json_file(json_filepath, is_backend: bool = False): + """Inits the result from the specific model result file""" + with open(json_filepath) as fp: + data = json.load(fp) + + # We manage the legacy config format + config = data.get("config", data.get("config_general", None)) + + # Precision + precision = Precision.from_str(config.get("model_dtype")) + + # Get model and org + org_and_model = config.get("model_name", config.get("model_args", None)) + org_and_model = org_and_model.split("/", 1) + + # Get inference framework + inference_framework = config.get("inference_framework", "Unknown") + + if len(org_and_model) == 1: + org = None + model = org_and_model[0] + result_key = f"{model}_{precision.value.name}" + else: + org = org_and_model[0] + model = org_and_model[1] + result_key = f"{org}_{model}_{precision.value.name}" + full_model = "/".join(org_and_model) + + still_on_hub, error, model_config = is_model_on_hub( + full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False + ) + architecture = "?" + if model_config is not None: + architectures = getattr(model_config, "architectures", None) + if architectures: + architecture = ";".join(architectures) + + # Extract results available in this file (some results are split in several files) + + # data['results'] is {'nq_open': {'em': 0.24293628808864265, 'em_stderr': 0.007138697341112125}} + + results = {} + for benchmark, benchmark_results in data["results"].items(): + if benchmark not in results: + results[benchmark] = {} + + for metric, value in benchmark_results.items(): + to_add = True + if "_stderr" in metric: + to_add = False + if "alias" in metric: + to_add = False + + if "," in metric: + metric = metric.split(",")[0] + metric = metric.replace("exact_match", "em") + + if to_add is True: + multiplier = 100.0 + if "GPU" in metric: + results[benchmark][metric] = value + continue + if "precision" in metric: + results[benchmark][metric] = value + continue + + if "rouge" in metric and "truthful" not in benchmark: + multiplier = 1.0 + if "squad" in benchmark: + multiplier = 1.0 + if "time" in metric: + multiplier = 1.0 + if "throughput" in metric: + multiplier = 1.0 + if "batch_" in metric or "Mem" in metric or "Util" in metric: + multiplier = 1 + + + # print('RESULTS', data['results']) + # print('XXX', benchmark, metric, value, multiplier) + results[benchmark][metric] = value * multiplier + + res = EvalResult( + eval_name=result_key, + full_model=full_model, + org=org, + model=model, + results=results, + precision=precision, + revision=config.get("model_sha", ""), + still_on_hub=still_on_hub, + architecture=architecture, + inference_framework=inference_framework, + ) + + return res + + def update_with_request_file(self, requests_path): + """Finds the relevant request file for the current model and updates info with it""" + request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name) + + try: + with open(request_file, "r") as f: + request = json.load(f) + + self.model_type = ModelType.from_str(request.get("model_type", "")) + self.weight_type = WeightType[request.get("weight_type", "Original")] + self.license = request.get("license", "?") + self.likes = request.get("likes", 0) + self.num_params = request.get("params", 0) + self.date = request.get("submitted_time", "") + 
self.inference_framework = request.get("inference_framework", "Unknown") + except Exception as e: + print(f"Could not find request file for {self.org}/{self.model} -- path: {requests_path} -- {e}") + + def is_complete(self) -> bool: + for task in Tasks: + if task.value.benchmark not in self.results: + return False + return True + + def to_dict(self): + """Converts the Eval Result to a dict compatible with our dataframe display""" + + # breakpoint() + # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks) + + data_dict = { + "eval_name": self.eval_name, # not a column, just a save name, + AutoEvalColumn.precision.name: self.precision.value.name, + AutoEvalColumn.model_type.name: self.model_type.value.name, + AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol, + AutoEvalColumn.weight_type.name: self.weight_type.value.name, + AutoEvalColumn.architecture.name: self.architecture, + AutoEvalColumn.model.name: make_clickable_model(self.full_model), + AutoEvalColumn.dummy.name: self.full_model, + AutoEvalColumn.revision.name: self.revision, + # AutoEvalColumn.average.name: average, + AutoEvalColumn.license.name: self.license, + AutoEvalColumn.likes.name: self.likes, + AutoEvalColumn.params.name: self.num_params, + AutoEvalColumn.still_on_hub.name: self.still_on_hub, + AutoEvalColumn.inference_framework.name: self.inference_framework, + } + + for task in Tasks: + if task.value.benchmark in self.results: + data_dict[task.value.col_name] = self.results[task.value.benchmark] + + return data_dict + + +def get_request_file_for_model(requests_path, model_name, precision): + """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED and RUNNING""" + request_files = os.path.join( + requests_path, + f"{model_name}_eval_request_*.json", + ) + request_files = glob.glob(request_files) + + # Select correct request file (precision) + request_file = "" + request_files = sorted(request_files, reverse=True) + + for tmp_request_file in request_files: + with open(tmp_request_file, "r") as f: + req_content = json.load(f) + if req_content["precision"] == precision.split(".")[-1]: + request_file = tmp_request_file + return request_file + + +def get_request_file_for_model_open_llm(requests_path, model_name, precision): + """Selects the correct request file for a given model. 
Only keeps runs tagged as FINISHED""" + request_files = os.path.join( + requests_path, + f"{model_name}_eval_request_*.json", + ) + request_files = glob.glob(request_files) + + # Select correct request file (precision) + request_file = "" + request_files = sorted(request_files, reverse=True) + for tmp_request_file in request_files: + with open(tmp_request_file, "r") as f: + req_content = json.load(f) + if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]: + request_file = tmp_request_file + return request_file + + +def update_model_type_with_open_llm_request_file(result, open_llm_requests_path): + """Finds the relevant request file for the current model and updates info with it""" + request_file = get_request_file_for_model_open_llm( + open_llm_requests_path, result.full_model, result.precision.value.name + ) + + if request_file: + try: + with open(request_file, "r") as f: + request = json.load(f) + open_llm_model_type = request.get("model_type", "Unknown") + if open_llm_model_type != "Unknown": + result.model_type = ModelType.from_str(open_llm_model_type) + except Exception as e: + pass + return result + + +def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool = False) -> list[EvalResult]: + """From the path of the results folder root, extract all needed info for results""" + model_result_filepaths = [] + + for root, _, files in os.walk(results_path): + # We should only have json files in model results + if len(files) == 0 or any([not f.endswith(".json") for f in files]): + continue + + # Sort the files by date + try: + files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7]) + except dateutil.parser._parser.ParserError: + files = [files[-1]] + + for file in files: + model_result_filepaths.append(os.path.join(root, file)) + + eval_results = {} + for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"): + # Creation of result + eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend) + eval_result.update_with_request_file(requests_path) + # Store results of same eval together + eval_name = eval_result.eval_name + if eval_name in eval_results.keys(): + eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None}) + else: + eval_results[eval_name] = eval_result + + results = [] + for v in eval_results.values(): + results.append(v) + + return results diff --git a/open-moe-llm-leaderboard-gh/src/populate.py b/open-moe-llm-leaderboard-gh/src/populate.py new file mode 100644 index 0000000000000000000000000000000000000000..9d003dd07edf0590f4f84844e73743bcb67c0a19 --- /dev/null +++ b/open-moe-llm-leaderboard-gh/src/populate.py @@ -0,0 +1,120 @@ +import json +import os +from tqdm import tqdm +import copy +import pandas as pd +import numpy as np + +from src.display.formatting import has_no_nan_values, make_clickable_model +from src.display.utils import AutoEvalColumn, EvalQueueColumn +from src.leaderboard.filter_models import filter_models +from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, update_model_type_with_open_llm_request_file + +from src.backend.envs import Tasks as BackendTasks +from src.display.utils import Tasks +from src.display.utils import system_metrics_to_name_map, gpu_metrics_to_name_map + +def get_leaderboard_df( + results_path: str, + requests_path: str, + requests_path_open_llm: str, + cols: list, + benchmark_cols: list, + is_backend: bool = False, 
+) -> tuple[list[EvalResult], pd.DataFrame]: + # Returns a list of EvalResult + raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path, requests_path_open_llm) + if requests_path_open_llm != "": + for result_idx in tqdm(range(len(raw_data)), desc="updating model type with open llm leaderboard"): + raw_data[result_idx] = update_model_type_with_open_llm_request_file( + raw_data[result_idx], requests_path_open_llm + ) + + # all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()] + all_data_json_ = [v.to_dict() for v in raw_data] # include incomplete evals + + name_to_bm_map = {} + + task_iterator = Tasks + if is_backend is True: + task_iterator = BackendTasks + + for task in task_iterator: + task = task.value + name = task.col_name + bm = (task.benchmark, task.metric) + name_to_bm_map[name] = bm + + + + all_data_json = [] + for entry in all_data_json_: + new_entry = copy.deepcopy(entry) + for k, v in entry.items(): + if k in name_to_bm_map: + benchmark, metric = name_to_bm_map[k] + new_entry[k] = entry[k][metric] + for sys_metric, metric_namne in system_metrics_to_name_map.items(): + if sys_metric in entry[k]: + new_entry[f"{k} {metric_namne}"] = entry[k][sys_metric] + + for gpu_metric, metric_namne in gpu_metrics_to_name_map.items(): + if gpu_metric in entry[k]: + new_entry[f"{k} {metric_namne}"] = entry[k][gpu_metric] + all_data_json += [new_entry] + + # all_data_json.append(baseline_row) + filter_models(all_data_json) + + df = pd.DataFrame.from_records(all_data_json) + + # if AutoEvalColumn.average.name in df: + # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False) + for col in cols: + if col not in df.columns: + df[col] = np.nan + + if not df.empty: + df = df.round(decimals=2) + + # filter out if any of the benchmarks have not been produced + # df = df[has_no_nan_values(df, benchmark_cols)] + + return raw_data, df + + +def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")] + all_evals = [] + + for entry in entries: + if ".json" in entry: + file_path = os.path.join(save_path, entry) + with open(file_path) as fp: + data = json.load(fp) + + data[EvalQueueColumn.model.name] = make_clickable_model(data["model"]) + data[EvalQueueColumn.revision.name] = data.get("revision", "main") + data[EvalQueueColumn.model_framework.name] = data.get("inference_framework", "-") + + all_evals.append(data) + elif ".md" not in entry: + # this is a folder + sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")] + for sub_entry in sub_entries: + file_path = os.path.join(save_path, entry, sub_entry) + with open(file_path) as fp: + data = json.load(fp) + + data[EvalQueueColumn.model.name] = make_clickable_model(data["model"]) + data[EvalQueueColumn.revision.name] = data.get("revision", "main") + data[EvalQueueColumn.model_framework.name] = data.get("inference_framework", "-") + all_evals.append(data) + + pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]] + running_list = [e for e in all_evals if e["status"] == "RUNNING"] + finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"] + df_pending = pd.DataFrame.from_records(pending_list, columns=cols) + df_running = pd.DataFrame.from_records(running_list, columns=cols) + df_finished = pd.DataFrame.from_records(finished_list, columns=cols) + return df_finished[cols], 
df_running[cols], df_pending[cols] diff --git a/open-moe-llm-leaderboard-gh/src/submission/check_validity.py b/open-moe-llm-leaderboard-gh/src/submission/check_validity.py new file mode 100644 index 0000000000000000000000000000000000000000..9c64c8e470460e5e00ba28219d8ff3b0de4ffdf0 --- /dev/null +++ b/open-moe-llm-leaderboard-gh/src/submission/check_validity.py @@ -0,0 +1,142 @@ +import json +import os +import re +from collections import defaultdict +from datetime import datetime, timedelta, timezone + +import huggingface_hub +from huggingface_hub import ModelCard +from huggingface_hub.hf_api import ModelInfo + +from transformers import AutoConfig, AutoTokenizer +from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config + +from src.envs import HAS_HIGHER_RATE_LIMIT + +from typing import Optional + + +# ht to @Wauplin, thank you for the snippet! +# See https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/317 +def check_model_card(repo_id: str) -> tuple[bool, str]: + # Returns operation status, and error message + try: + card = ModelCard.load(repo_id) + except huggingface_hub.utils.EntryNotFoundError: + return False, "Please add a model card to your model to explain how you trained/fine-tuned it." + + # Enforce license metadata + if card.data.license is None: + if not ("license_name" in card.data and "license_link" in card.data): + return False, ( + "License not found. Please add a license to your model card using the `license` metadata or a" + " `license_name`/`license_link` pair." + ) + + # Enforce card content + if len(card.text) < 200: + return False, "Please add a description to your model card, it is too short." + + return True, "" + + +def is_model_on_hub( + model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False +) -> tuple[bool, Optional[str], Optional[AutoConfig]]: + try: + config = AutoConfig.from_pretrained( + model_name, revision=revision, trust_remote_code=trust_remote_code, token=token + ) + if test_tokenizer: + try: + AutoTokenizer.from_pretrained( + model_name, revision=revision, trust_remote_code=trust_remote_code, token=token + ) + except ValueError as e: + return False, f"uses a tokenizer which is not in a transformers release: {e}", None + except Exception as e: + return ( + False, + "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", + None, + ) + return True, None, config + + except ValueError as e: + return ( + False, + "needs to be launched with `trust_remote_code=True`. 
For safety reason, we do not allow these models to be automatically submitted to the leaderboard.", + None, + ) + + except Exception as e: + return False, f"was not found on hub -- {str(e)}", None + + +def get_model_size(model_info: ModelInfo, precision: str): + size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)") + try: + model_size = round(model_info.safetensors["total"] / 1e9, 3) + except (AttributeError, TypeError): + try: + size_match = re.search(size_pattern, model_info.modelId.lower()) + model_size = size_match.group(0) + model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3) + except AttributeError: + return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py + + size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1 + model_size = size_factor * model_size + return model_size + + +def get_model_arch(model_info: ModelInfo): + return model_info.config.get("architectures", "Unknown") + + +def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota): + if org_or_user not in users_to_submission_dates: + return True, "" + submission_dates = sorted(users_to_submission_dates[org_or_user]) + + time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ") + submissions_after_timelimit = [d for d in submission_dates if d > time_limit] + + num_models_submitted_in_period = len(submissions_after_timelimit) + if org_or_user in HAS_HIGHER_RATE_LIMIT: + rate_limit_quota = 2 * rate_limit_quota + + if num_models_submitted_in_period > rate_limit_quota: + error_msg = f"Organisation or user `{org_or_user}`" + error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard " + error_msg += f"in the last {rate_limit_period} days.\n" + error_msg += ( + "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗" + ) + return False, error_msg + return True, "" + + +def already_submitted_models(requested_models_dir: str) -> set[str]: + depth = 1 + file_names = [] + users_to_submission_dates = defaultdict(list) + + for root, _, files in os.walk(requested_models_dir): + current_depth = root.count(os.sep) - requested_models_dir.count(os.sep) + if current_depth == depth: + for file in files: + if not file.endswith(".json"): + continue + with open(os.path.join(root, file), "r") as f: + info = json.load(f) + if not info["status"] == "FINISHED" and not info["status"] == "RUNNING": + file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}") + + # Select organisation + if info["model"].count("/") == 0 or "submitted_time" not in info: + continue + organisation, _ = info["model"].split("/") + users_to_submission_dates[organisation].append(info["submitted_time"]) + + return set(file_names), users_to_submission_dates diff --git a/open-moe-llm-leaderboard-gh/src/submission/submit.py b/open-moe-llm-leaderboard-gh/src/submission/submit.py new file mode 100644 index 0000000000000000000000000000000000000000..d9b861ec95d2ce88642e0628b97319472aba8b9d --- /dev/null +++ b/open-moe-llm-leaderboard-gh/src/submission/submit.py @@ -0,0 +1,148 @@ +import json +import os +from datetime import datetime, timezone + +from src.display.formatting import styled_error, styled_message, styled_warning +from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, 
RATE_LIMIT_QUOTA, DEBUG_QUEUE_REPO +from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS +from src.submission.check_validity import ( + already_submitted_models, + check_model_card, + get_model_size, + is_model_on_hub, + user_submission_permission, +) + +REQUESTED_MODELS = None +USERS_TO_SUBMISSION_DATES = None + + +def add_new_eval( + model: str, + base_model: str, + revision: str, + precision: str, + private: bool, + weight_type: str, + model_type: str, + inference_framework: str, + debug: bool = False, + gpu_type: str = "NVIDIA-A100-PCIe-80GB", +): + global REQUESTED_MODELS + global USERS_TO_SUBMISSION_DATES + if not REQUESTED_MODELS: + REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH) + + if debug: + QUEUE_REPO = DEBUG_QUEUE_REPO + + user_name = "" + model_path = model + if "/" in model: + user_name = model.split("/")[0] + model_path = model.split("/")[1] + + precision = precision.split(" ")[0] + current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + if model_type is None or model_type == "": + return styled_error("Please select a model type.") + + # Is the user rate limited? + if user_name != "": + user_can_submit, error_msg = user_submission_permission( + user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA + ) + if not user_can_submit: + return styled_error(error_msg) + + # Did the model authors forbid its submission to the leaderboard? + if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS: + return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.") + + # Does the model actually exist? + if revision == "": + revision = "main" + + # Is the model on the hub? + if weight_type in ["Delta", "Adapter"]: + base_model_on_hub, error, _ = is_model_on_hub( + model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=False + ) + if not base_model_on_hub: + return styled_error(f'Base model "{base_model}" {error}') + + if not weight_type == "Adapter": + model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=False) + if not model_on_hub: + return styled_error(f'Model "{model}" {error}') + + # Is the model info correctly filled? + try: + model_info = API.model_info(repo_id=model, revision=revision) + except Exception: + return styled_error("Could not get your model information. Please fill it up properly.") + + model_size = get_model_size(model_info=model_info, precision=precision) + + # Were the model card and license filled? 
+ try: + license = model_info.cardData["license"] + except Exception: + return styled_error("Please select a license for your model") + + # TODO: Check if the inference framework is valid + + modelcard_OK, error_msg = check_model_card(model) + if not modelcard_OK: + return styled_error(error_msg) + + # Seems good, creating the eval + print("Adding new eval") + + eval_entry = { + "model": model, + "base_model": base_model, + "revision": revision, + "private": private, + "precision": precision, + "weight_type": weight_type, + "status": "PENDING", + "submitted_time": current_time, + "model_type": model_type, + "likes": model_info.likes, + "params": model_size, + "license": license, + "inference_framework": inference_framework, + "gpu_type": gpu_type + } + + # Check for duplicate submission + if f"{model}_{revision}_{precision}_{inference_framework}_{gpu_type}" in REQUESTED_MODELS: + return styled_warning("This model has been already submitted.") + + print("Creating eval file") + OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}" + os.makedirs(OUT_DIR, exist_ok=True) + # out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json" + out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}_{gpu_type}.json" + + with open(out_path, "w") as f: + f.write(json.dumps(eval_entry)) + + print("Uploading eval file") + API.upload_file( + path_or_fileobj=out_path, + path_in_repo=out_path.split("eval-queue/")[1], + repo_id=QUEUE_REPO, + repo_type="dataset", + commit_message=f"Add {model} to eval queue", + ) + + # Remove the local file + os.remove(out_path) + + return styled_message( + "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list." 
+ ) diff --git a/open-moe-llm-leaderboard-gh/src/utils.py b/open-moe-llm-leaderboard-gh/src/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..986072cbdd87b940c003d7ed3b158fdcd8ecd3fd --- /dev/null +++ b/open-moe-llm-leaderboard-gh/src/utils.py @@ -0,0 +1,236 @@ +import pandas as pd +from huggingface_hub import snapshot_download +import subprocess +import re +import os +import GPUtil + +try: + from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name +except: + print("local debug: from display.utils") + from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name + +MEM_BW_DICT ={ + "NVIDIA-A100-PCIe-80GB": 1935, + "NVIDIA-A100-SXM-80GB": 2039, + "NVIDIA-H100-PCIe-80GB": 2039, + "NVIDIA-RTX-A5000-24GB": 768 +} + +PEAK_FLOPS_DICT = { + "float32":{ + "NVIDIA-A100-PCIe-80GB": 312e12, + "NVIDIA-A100-SXM-80GB": 312e12, + "NVIDIA-H100-PCIe-80GB": 756e12, + "NVIDIA-RTX-A5000-24GB": 222.2e12 + }, + "float16":{ + "NVIDIA-A100-PCIe-80GB": 624e12, + "NVIDIA-A100-SXM-80GB": 624e12, + "NVIDIA-H100-PCIe-80GB": 1513e12, + "NVIDIA-RTX-A5000-24GB": 444.4e12 + }, + "8bit":{ + "NVIDIA-A100-PCIe-80GB": 1248e12, + "NVIDIA-A100-SXM-80GB": 1248e12, + "NVIDIA-H100-PCIe-80GB": 3026e12, + "NVIDIA-RTX-A5000-24GB": 889e12 + }, + "4bit": { + "NVIDIA-A100-PCIe-80GB": 2496e12, + "NVIDIA-A100-SXM-80GB": 2496e12, + "NVIDIA-H100-PCIe-80GB": 6052e12, + "NVIDIA-RTX-A5000-24GB": 1778e12 + } + +} + +def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers): + for i in range(10): + try: + snapshot_download( + repo_id=repo_id, revision=revision, local_dir=local_dir, repo_type=repo_type, max_workers=max_workers + ) + return + except Exception as e: + print(f"Failed to download {repo_id} at {revision} with error: {e}. 
Retrying...") + import time + + time.sleep(60) + return + + +def get_dataset_url(row): + dataset_name = row["Benchmark"] + dataset_url = row["Dataset Link"] + benchmark = f'{dataset_name}' + return benchmark + + +def get_dataset_summary_table(file_path): + df = pd.read_csv(file_path) + + df["Benchmark"] = df.apply(lambda x: get_dataset_url(x), axis=1) + + df = df[["Category", "Benchmark", "Data Split", "Data Size", "Language"]] + + return df + +def parse_nvidia_smi(): + visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) + if visible_devices is not None: + gpu_indices = visible_devices.split(',') + else: + # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set + result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True) + if result.returncode != 0: + print("Failed to query GPU indices.") + return [] + gpu_indices = result.stdout.strip().split('\n') + # print(f"gpu_indices: {gpu_indices}") + gpu_stats = [] + + gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%') + gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)') + + gpu_name = "" + for index in gpu_indices: + result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True) + output = result.stdout.strip() + lines = output.split("\n") + for line in lines: + match = gpu_info_pattern.search(line) + name_match = gpu_name_pattern.search(line) + gpu_info = {} + if name_match: + gpu_name = name_match.group(1).strip() + if match: + temp, power_usage, mem_usage, gpu_util = map(int, match.groups()) + gpu_info.update({ + GPU_TEMP: temp, + GPU_Power: power_usage, + GPU_Mem: round(mem_usage / 1024, 2), + GPU_Util: gpu_util + }) + + if len(gpu_info) >= 4: + gpu_stats.append(gpu_info) + # print(f"gpu_stats: {gpu_stats}") + gpu_name = f"{len(gpu_stats)}x{gpu_name}" + gpu_stats_total = { + GPU_TEMP: 0, + GPU_Power: 0, + GPU_Mem: 0, + GPU_Util: 0, + GPU_Name: gpu_name + } + for gpu_stat in gpu_stats: + gpu_stats_total[GPU_TEMP] += gpu_stat[GPU_TEMP] + gpu_stats_total[GPU_Power] += gpu_stat[GPU_Power] + gpu_stats_total[GPU_Mem] += gpu_stat[GPU_Mem] + gpu_stats_total[GPU_Util] += gpu_stat[GPU_Util] + gpu_stats_total[GPU_Mem] = gpu_stats_total[GPU_Mem] # G + gpu_stats_total[GPU_TEMP] /= len(gpu_stats) + gpu_stats_total[GPU_Power] /= len(gpu_stats) + gpu_stats_total[GPU_Util] /= len(gpu_stats) + return [gpu_stats_total] + +def monitor_gpus(stop_event, interval, stats_list): + while not stop_event.is_set(): + gpu_stats = parse_nvidia_smi() + if gpu_stats: + stats_list.extend(gpu_stats) + stop_event.wait(interval) + +def analyze_gpu_stats(stats_list): + # Check if the stats_list is empty, and return None if it is + if not stats_list: + return None + + # Initialize dictionaries to store the stats + avg_stats = {} + max_stats = {} + + # Calculate average stats, excluding 'GPU_Mem' + for key in stats_list[0].keys(): + if key != GPU_Mem and key != GPU_Name: + total = sum(d[key] for d in stats_list) + avg_stats[key] = total / len(stats_list) + + # Calculate max stats for 'GPU_Mem' + max_stats[GPU_Mem] = max(d[GPU_Mem] for d in stats_list) + if GPU_Name in stats_list[0]: + avg_stats[GPU_Name] = stats_list[0][GPU_Name] + # Update average stats with max GPU memory usage + avg_stats.update(max_stats) + + return avg_stats + +def get_gpu_number(): + visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) + if visible_devices is not None: + gpu_indices = visible_devices.split(',') + else: + # Query all GPU indices if 
CUDA_VISIBLE_DEVICES is not set + result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True) + if result.returncode != 0: + print("Failed to query GPU indices.") + return [] + gpu_indices = result.stdout.strip().split('\n') + # print(f"gpu_indices: {gpu_indices}") + gpu_stats = [] + + gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%') + + for index in gpu_indices: + result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True) + output = result.stdout.strip() + lines = output.split("\n") + for line in lines: + match = gpu_info_pattern.search(line) + gpu_info = {} + if match: + temp, power_usage, mem_usage, gpu_util = map(int, match.groups()) + gpu_info.update({ + GPU_TEMP: temp, + GPU_Power: power_usage, + GPU_Mem: round(mem_usage / 1024, 2), + GPU_Util: gpu_util + }) + + if len(gpu_info) >= 4: + gpu_stats.append(gpu_info) + + return len(gpu_stats) + +def get_gpu_details(): + gpus = GPUtil.getGPUs() + gpu = gpus[0] + name = gpu.name.replace(" ", "-") + # Convert memory from MB to GB and round to nearest whole number + memory_gb = round(gpu.memoryTotal / 1024) + memory = f"{memory_gb}GB" + formatted_name = f"{name}-{memory}" + return formatted_name + +def get_peak_bw(gpu_name): + return MEM_BW_DICT[gpu_name] + +def get_peak_flops(gpu_name, precision): + return PEAK_FLOPS_DICT[precision][gpu_name] + +def transfer_precision2bytes(precision): + if precision == "float32": + return 4 + elif precision == "float16": + return 2 + elif precision == "8bit": + return 1 + elif precision == "4bit": + return 0.5 + else: + raise ValueError(f"Unsupported precision: {precision}") + +if __name__ == "__main__": + print(analyze_gpu_stats(parse_nvidia_smi()))
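To see how the pieces above fit together, here is a minimal usage sketch for the read_evals/populate path. It assumes the request and result datasets have already been downloaded into the default `eval-queue` and `eval-results` directories defined in `src/envs.py` (for example via `my_snapshot_download`); the selected display columns are just an example.

```python
# Minimal sketch: build the leaderboard DataFrame from local request/result dumps.
from src.display.utils import BENCHMARK_COLS, COLS
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
from src.populate import get_leaderboard_df

raw_results, leaderboard_df = get_leaderboard_df(
    results_path=EVAL_RESULTS_PATH,    # local copy of sparse-generative-ai/results
    requests_path=EVAL_REQUESTS_PATH,  # local copy of sparse-generative-ai/requests
    requests_path_open_llm="",         # "" skips the open-llm-leaderboard model-type merge
    cols=COLS,
    benchmark_cols=BENCHMARK_COLS,
)
print(leaderboard_df[["Model", "MMLU", "GSM8K"]].head())
```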
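The submission entry point in `src/submission/submit.py` can be exercised directly in the same way. Everything in the sketch below is an illustrative assumption: the model id is arbitrary, a valid `H4_TOKEN` with write access must be set, and `debug=True` routes the request file to `DEBUG_QUEUE_REPO` rather than the public queue.

```python
# Hypothetical driver for add_new_eval(); values are examples only.
from src.submission.submit import add_new_eval

status_html = add_new_eval(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",  # example repo id only
    base_model="",
    revision="main",
    precision="float16",
    private=False,
    weight_type="Original",
    model_type="💬 chat",
    inference_framework="moe-infinity",
    debug=True,                                    # write to the debug queue repo
    gpu_type="NVIDIA-A100-PCIe-80GB",
)
print(status_html)  # a styled_message / styled_warning / styled_error HTML snippet
```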
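Finally, the `MEM_BW_DICT`, `PEAK_FLOPS_DICT`, and `transfer_precision2bytes` helpers in `src/utils.py` lend themselves to a quick roofline-style sanity check of the reported decoding throughput. The helper name and the 7B example below are assumptions added for illustration: the only claim is that, when decoding is memory-bandwidth bound, peak bandwidth divided by the bytes of active weights bounds tokens per second.

```python
# Illustrative roofline-style bound built from helpers defined in src/utils.py.
from src.utils import get_peak_bw, transfer_precision2bytes


def estimate_decoding_throughput_bound(gpu_name: str, precision: str, active_params_b: float) -> float:
    """Rough tokens/s ceiling when each generated token must stream the active weights from GPU memory."""
    bytes_per_param = transfer_precision2bytes(precision)   # e.g. 2 for float16
    weight_bytes = active_params_b * 1e9 * bytes_per_param  # bytes read per generated token
    peak_bw_bytes_per_s = get_peak_bw(gpu_name) * 1e9       # MEM_BW_DICT values are in GB/s
    return peak_bw_bytes_per_s / weight_bytes


# Assumed numbers: ~7B active float16 parameters on one H100 PCIe
# -> 2039 GB/s / 14 GB per token ≈ 146 tokens/s as a theoretical ceiling.
print(estimate_decoding_throughput_bound("NVIDIA-H100-PCIe-80GB", "float16", 7.0))
```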