sync from github

Changed files:
- open-moe-llm-leaderboard-gh/backend-cli.py +1 -0
- open-moe-llm-leaderboard-gh/src/backend/hflm_with_measurement.py +145 -50
- open-moe-llm-leaderboard-gh/src/backend/moe_infinity.py +2 -1
- open-moe-llm-leaderboard-gh/src/backend/run_eval_suite.py +8 -0
- open-moe-llm-leaderboard-gh/src/backend/tasks/gsm8k/gsm8k-custom.yaml +10 -7
- open-moe-llm-leaderboard-gh/src/backend/tasks/measurement_task_utils.py +9 -0
- open-moe-llm-leaderboard-gh/src/display/utils.py +5 -1
- open-moe-llm-leaderboard-gh/src/submission/check_validity.py +1 -1
- open-moe-llm-leaderboard-gh/src/utils.py +16 -4
open-moe-llm-leaderboard-gh/backend-cli.py
CHANGED
@@ -473,6 +473,7 @@ if __name__ == "__main__":
     precisions = args.precision.split(",")
     print(f"debug_model_names: {debug_model_names}, debug_task_name: {debug_task_name}, precisions: {precisions}")
     task_lst = TASKS_HARNESS.copy()
+    RESULTS_REPO = DEBUG_RESULTS_REPO
     for precision in precisions:
         for debug_model_name in debug_model_names:
             for task in task_lst:
open-moe-llm-leaderboard-gh/src/backend/hflm_with_measurement.py
CHANGED
@@ -37,6 +37,9 @@ from lm_eval.models.utils import (
     stop_sequences_criteria,
 )
 from lm_eval.models.huggingface import HFLM
+from src.utils import get_gpu_number, get_gpu_details, get_peak_bw, transfer_precision2bytes, get_peak_flops
+from src.submission.check_validity import get_model_size
+from src.envs import API
 
 
 class StopWatch(TextStreamer):

@@ -67,6 +70,9 @@ class StopWatch(TextStreamer):
 class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        self.pretrained = kwargs.get("pretrained", None)
+        self.revision = kwargs.get("revision", None)
+        self.precision = kwargs.get("dtype", None)
 
     def _loglikelihood_tokens(
         self,

@@ -288,7 +294,7 @@ class HFLMWithMeasurement(HFLM):
 
         return re_ord.get_original(res)
 
-    def _model_generate(self, context, max_length, stop, **generation_kwargs):
+    def _model_generate(self, context, max_tokens, stop, **generation_kwargs):
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
         # remove temperature, as do_sample=False takes care of this

@@ -296,7 +302,7 @@ class HFLMWithMeasurement(HFLM):
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
         do_sample = generation_kwargs.get("do_sample", None)
 
-        is_gsm8k = generation_kwargs.get("is_gsm8k", False)
+        # is_gsm8k = generation_kwargs.get("is_gsm8k", False)
 
         # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:

@@ -305,48 +311,133 @@ class HFLMWithMeasurement(HFLM):
         if do_sample is False and generation_kwargs.get("temperature") == 0.0:
             generation_kwargs.pop("temperature")
 
-        if is_gsm8k:
-            generation_kwargs.pop("is_gsm8k")
-
-        stopping_criteria = stop_sequences_criteria(
-            self.tokenizer, stop, context.shape[1], context.shape[0]
-        )
-        stop_watch = StopWatch(self.tokenizer)
-        start = time()
-        res = self.model.generate(
-            input_ids=context,
-            max_length=max_length,
-            stopping_criteria=stopping_criteria,
-            pad_token_id=self.tokenizer.pad_token_id,
-            use_cache=True,
-            streamer=stop_watch,
-            **generation_kwargs,
-        )
-        end = time()
+        # if is_gsm8k:
+        #     generation_kwargs.pop("is_gsm8k")
+
+        context_length = context.shape[1]
+
+        if self.model.__class__.__name__ == "MoE":
+            model_config = self.model.model.config
+        else:
+            model_config = self.model.config
+
+        if not self.precision:
+            if model_config.quantization_config._load_in_4bit:
+                self.precision = "4bit"
+            elif model_config.quantization_config._load_in_8bit:
+                self.precision = "8bit"
+            else:
+                raise ValueError("Unknown precision")
+
+        # print(self.model)
+        linear_count = 0
+        element_wise_mul = 0
+        for name, module in self.model.named_modules():
+            if ('layers.0.' in name or 'decoder.0.' in name) and ('attn' not in name):
+                if 'experts.0.' in name:
+                    if isinstance(module, torch.nn.Linear):
+                        # print(name, module)
+                        linear_count += 1
+                elif 'experts' not in name:
+                    if "gate" not in name or "gate_proj" in name:
+                        if "gate_proj" in name:
+                            element_wise_mul = 1
+                        if isinstance(module, torch.nn.Linear):
+                            # print(name, module)
+                            linear_count += 1
+                else:
+                    continue
+        print(f"linear_count: {linear_count}")
+
+        stopping_criteria = stop_sequences_criteria(
+            self.tokenizer, stop, context.shape[1], context.shape[0]
+        )
+        stop_watch = StopWatch(self.tokenizer)
+        start = time()
+        res = self.model.generate(
+            input_ids=context,
+            max_new_tokens=max_tokens,
+            stopping_criteria=stopping_criteria,
+            pad_token_id=self.tokenizer.pad_token_id,
+            use_cache=True,
+            streamer=stop_watch,
+            **generation_kwargs,
+        )
+        end = time()
 
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
+
+        precision_bytes = transfer_precision2bytes(self.precision)
+
+        model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
+        model_size_param = get_model_size(model_info=model_info, precision=self.precision)
+
+        n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else model_config.num_layers
+        d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
+
+        if hasattr(model_config, "num_experts_per_tok"):
+            n_experts_per_tok = model_config.num_experts_per_tok
+        elif hasattr(model_config, "num_selected_experts"):
+            n_experts_per_tok = model_config.num_selected_experts
+        else:
+            n_experts_per_tok = 1
+
+        if hasattr(model_config, "ffn_dim"):
+            d_ff = model_config.ffn_dim
+        elif hasattr(model_config, "intermediate_size"):
+            d_ff = model_config.intermediate_size
+        elif hasattr(model_config, "d_ff"):
+            d_ff = model_config.d_ff
+        else:
+            if hasattr(model_config, "ff_ratio"):
+                d_ff = d_model * model_config.ff_ratio
+            else:
+                raise ValueError("Unknown FFN dimension")
+
+        if hasattr(model_config, "num_local_experts"):
+            num_experts = model_config.num_local_experts
+        elif hasattr(model_config, "num_experts"):
+            num_experts = model_config.num_experts
+        else:
+            num_experts = 1
+
+        ffn_params = n_layers * d_ff * linear_count * d_model
+
+        shared_params = model_size_param * 1e9 - num_experts * ffn_params
+
+        model_size = shared_params + n_experts_per_tok * ffn_params
+
+        per_token_kv_size = 2 * n_layers * d_model * precision_bytes
+
+        peak_bw_single = get_peak_bw(get_gpu_details())
+        peak_bw = peak_bw_single * get_gpu_number()
+
+        context_prefill_size = context_length
+        kv_size = context_prefill_size * per_token_kv_size + (output_length - 1) * per_token_kv_size / 2
+
+        kv_size = kv_size / 1e9
+
+        n_vocab = model_config.vocab_size
 
         end_to_end_time = (end - start) / batch_size
         prefilling_time = stop_watch.prefilling_time / batch_size
         decoding_time = stop_watch.decoding_time / batch_size
         token_per_sec = output_length / decoding_time
-        return res, end_to_end_time, prefilling_time, token_per_sec
+        achieve_mem_bw = (model_size * precision_bytes / 1e9 + kv_size) * token_per_sec
+
+        avg_context_length = context_length + (output_length - 1) / 2
+        flops_per_token = 2 * model_size + ((linear_count + element_wise_mul) * n_layers * avg_context_length * d_model) + 4 * d_model + 2 * d_model * n_vocab
+        peak_flops_single = get_peak_flops(get_gpu_details(), self.precision)
+        peak_flops = peak_flops_single * get_gpu_number()
+
+        ## TODO only supports llama-type decoder-only models and MoE models (Switch Transformer, Mixtral)
+        mfu = token_per_sec * flops_per_token / peak_flops
+        mbu = achieve_mem_bw / peak_bw
+
+        print(f"mfu: {mfu}, mbu: {mbu}")
+
+        return res, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu
 
     def generate_until(
         self, requests: List[Instance], disable_tqdm: bool = False

@@ -423,15 +514,18 @@ class HFLMWithMeasurement(HFLM):
                     f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                 )
             # add EOS token to stop sequences
-            eos =
+            eos = "<|eot_id|>"
             if not until:
                 until = [eos]
             else:
                 until.append(eos)
 
-            is_gsm8k = kwargs.get("is_gsm8k", False)
-            if is_gsm8k:
-                until = ["Question:", "Question", "</s>"]
+            # is_gsm8k = kwargs.get("is_gsm8k", False)
+            # if is_gsm8k:
+            #     until = ["Question:", "Question", "</s>"]
+            #     eos_ids = [self.tokenizer.eos_token_id,
+            #                self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
 
             if "max_gen_toks" in kwargs.keys():
                 max_gen_toks = kwargs.pop("max_gen_toks")

@@ -457,11 +551,11 @@ class HFLMWithMeasurement(HFLM):
             context_enc = context_enc.to(self.device)
             attn_masks = attn_masks.to(self.device)
 
-            if "max_length" not in kwargs:
-                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
+            if "max_tokens" not in kwargs:
+                kwargs["max_tokens"] = max_gen_toks
 
             # perform batched generation
-            cont, end_to_end_time, prefilling_time, token_per_sec = self._model_generate(
+            cont, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu = self._model_generate(
                 context=context_enc,
                 attention_mask=attn_masks,
                 stop=until,

@@ -477,15 +571,16 @@ class HFLMWithMeasurement(HFLM):
 
                 s = self.tok_decode(cont_toks)
 
-                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
-                if not is_gsm8k:
-                    for term in until:
-                        if len(term) > 0:
-                            # ignore '' separator,
-                            # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
-                            s = s.split(term)[0]
-
-                res.append((s, end_to_end_time, prefilling_time, token_per_sec))
+                # # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
+                # if not is_gsm8k:
+                for term in until:
+                    if len(term) > 0:
+                        # ignore '' separator,
+                        # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
+                        s = s.split(term)[0]
+
+                # print(s)
+                res.append((s, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu))
 
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                 pbar.update(1)
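Note on the measurement code above: the MFU/MBU arithmetic added to _model_generate reduces to a few closed-form estimates. The sketch below restates it outside the class for readability. It is a minimal illustration, not leaderboard output: all inputs are hypothetical, peak bandwidth is assumed to be reported in GB/s, and it inherits the patch's own TODO caveat (llama-style decoder-only models and Switch/Mixtral-style MoEs).

# Minimal sketch of the MFU/MBU arithmetic above; all example inputs are hypothetical.

def active_params(total_params, num_experts, n_experts_per_tok,
                  n_layers, d_model, d_ff, linear_count):
    # An MoE only touches the routed experts per token: subtract every
    # expert's FFN weights, then add back only the selected ones.
    ffn_params = n_layers * d_ff * linear_count * d_model
    shared_params = total_params - num_experts * ffn_params
    return shared_params + n_experts_per_tok * ffn_params

def mfu_mbu(token_per_sec, model_size, precision_bytes,
            n_layers, d_model, n_vocab, linear_count, element_wise_mul,
            context_length, output_length, peak_flops, peak_bw_gbs):
    # KV-cache bytes: full prefill, plus decode averaged over the run
    # (the cache grows linearly, hence the (output_length - 1) / 2 term).
    per_token_kv = 2 * n_layers * d_model * precision_bytes
    kv_gb = (context_length * per_token_kv + (output_length - 1) * per_token_kv / 2) / 1e9
    # Achieved bandwidth: weights plus KV cache read once per generated token.
    achieved_bw = (model_size * precision_bytes / 1e9 + kv_gb) * token_per_sec
    # FLOPs per token: 2 * active params for the matmuls, attention over the
    # average context length, plus the LM-head projection.
    avg_ctx = context_length + (output_length - 1) / 2
    flops_per_token = (2 * model_size
                       + (linear_count + element_wise_mul) * n_layers * avg_ctx * d_model
                       + 4 * d_model + 2 * d_model * n_vocab)
    mfu = token_per_sec * flops_per_token / peak_flops
    mbu = achieved_bw / peak_bw_gbs
    return mfu, mbu

Here model_size is the active parameter count from active_params, which is why the two utilization numbers are comparable across dense and MoE checkpoints.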
open-moe-llm-leaderboard-gh/src/backend/moe_infinity.py
CHANGED
@@ -31,8 +31,9 @@ class MoEHFLM(HFLMWithMeasurement):
         self.use_chat_template = use_chat_template
         if "device" in kwargs:
             kwargs.pop("device")
+        kwargs["device_map"] = "cuda:0"
         super().__init__(
             *args, **kwargs, pretrained=pretrained
         )  # Assuming HFLM accepts a 'pretrained' arg and handles it
         # self._create_model()
         shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
open-moe-llm-leaderboard-gh/src/backend/run_eval_suite.py
CHANGED
@@ -17,12 +17,16 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
+        mfu = sum([r[4] for r in results]) / len(results)
+        mbu = sum([r[5] for r in results]) / len(results)
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         result_dict = func(self, doc, processed_results, *args, **kwargs)
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
+        result_dict["mfu"] = mfu * 100
+        result_dict["mbu"] = mbu * 100
         return result_dict
     return wrapper
 ConfigurableTask.process_results = process_results_decorator(orig_process_results)

@@ -33,6 +37,8 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
+        aggregation_list["mfu"] = mean
+        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper
 ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)

@@ -43,6 +49,8 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
+        higher_is_better_dict["mfu"] = True
+        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
 ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
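These decorators consume the 6-tuples that generate_until now appends per request. A small sketch of the shape they assume, with made-up numbers:

# Each row is (text, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu);
# the values here are hypothetical, for illustration only.
results = [
    ("... #### 42", 3.1, 0.4, 28.5, 0.21, 0.55),
    ("... #### 7",  2.9, 0.3, 30.2, 0.23, 0.58),
]
mfu = sum([r[4] for r in results]) / len(results)  # 0.22
mbu = sum([r[5] for r in results]) / len(results)  # 0.565
# The patched process_results then stores these as percentages (mfu * 100, mbu * 100).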
open-moe-llm-leaderboard-gh/src/backend/tasks/gsm8k/gsm8k-custom.yaml
CHANGED
@@ -22,18 +22,21 @@ metric_list:
       - "\\.$"
 generation_kwargs:
   until:
+    - "Question:"
+    - "Question"
+    - "</s>"
+    - "<|im_end|>"
   do_sample: false
   temperature: 0.0
-  is_gsm8k: true
+  # is_gsm8k: true
 repeats: 1
 num_fewshot: 5
 filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
+      - function: "take_first"
   - name: "flexible-extract"
     filter:
       - function: "regex"
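The restored strict-match filter keys on GSM8K's canonical answer marker. A quick illustration of the pattern (the YAML double backslashes become single escapes in Python; the completion string is made up):

import re

strict_match = re.compile(r"#### (\-?[0-9\.\,]+)")
completion = "She sells 5 * 3 = 15 clips.\n#### 15"  # hypothetical model output
m = strict_match.search(completion)
print(m.group(1))  # "15"; "take_first" then keeps this first extraction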
open-moe-llm-leaderboard-gh/src/backend/tasks/measurement_task_utils.py
CHANGED
@@ -12,6 +12,9 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
+        mfu = sum([r[4] for r in results]) / len(results)
+        mbu = sum([r[5] for r in results]) / len(results)
+
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         # Now call the original process_results with the processed results

@@ -19,6 +22,8 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
+        result_dict["mfu"] = mfu
+        result_dict["mbu"] = mbu
         return result_dict
     return wrapper

@@ -30,6 +35,8 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
+        aggregation_list["mfu"] = mean
+        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper

@@ -41,6 +48,8 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
+        higher_is_better_dict["mfu"] = True
+        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
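Note the asymmetry with run_eval_suite.py: there the ratios are stored as percentages, while this module stores them raw. A hypothetical row makes the difference concrete:

# Same averaged ratio, two conventions (hypothetical value):
mfu = 0.22
result_dict["mfu"] = mfu * 100   # run_eval_suite.py -> 22.0 (percent)
result_dict["mfu"] = mfu         # measurement_task_utils.py -> 0.22 (ratio)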
open-moe-llm-leaderboard-gh/src/display/utils.py
CHANGED
@@ -18,12 +18,16 @@ GPU_Power = 'Power(W)'
 GPU_Mem = 'Mem(G)'
 GPU_Name = "GPU"
 GPU_Util = 'Util(%)'
+MFU = 'MFU(%)'
+MBU = 'MBU(%)'
 BATCH_SIZE = 'bs'
 PRECISION = "Precision"
 system_metrics_to_name_map = {
     "end_to_end_time": f"{E2Es}",
     "prefilling_time": f"{PREs}",
     "decoding_throughput": f"{TS}",
+    "mfu": f"{MFU}",
+    "mbu": f"{MBU}"
 }
 
 gpu_metrics_to_name_map = {

@@ -75,7 +79,7 @@ class Tasks(Enum):
     # # XXX include me back at some point
     selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
-    gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM
+    gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
open-moe-llm-leaderboard-gh/src/submission/check_validity.py
CHANGED
@@ -74,7 +74,7 @@ def is_model_on_hub(
 
 
 def get_model_size(model_info: ModelInfo, precision: str):
-    size_pattern =
+    size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
     except (AttributeError, TypeError):
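size_pattern backs the fallback path of get_model_size: when a repo has no safetensors metadata, the parameter count is scraped from the model id. A hedged illustration (the model ids below are invented):

import re

size_pattern = re.compile(r"(\d\.)?\d+(b|m)")

# Hypothetical ids; the pattern picks up counts like "7b", "350m", "1.3b".
for repo_id in ["org/example-7b-instruct", "org/example-350m", "org/example-1.3b"]:
    m = size_pattern.search(repo_id.lower())
    print(repo_id, "->", m.group(0) if m else None)
# org/example-7b-instruct -> 7b
# org/example-350m -> 350m
# org/example-1.3b -> 1.3b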
open-moe-llm-leaderboard-gh/src/utils.py
CHANGED
@@ -31,6 +31,12 @@ PEAK_FLOPS_DICT = {
     "NVIDIA-H100-PCIe-80GB": 1513e12,
     "NVIDIA-RTX-A5000-24GB": 444.4e12
     },
+    "bfloat16":{
+        "NVIDIA-A100-PCIe-80GB": 624e12,
+        "NVIDIA-A100-SXM-80GB": 624e12,
+        "NVIDIA-H100-PCIe-80GB": 1513e12,
+        "NVIDIA-RTX-A5000-24GB": 444.4e12
+    },
     "8bit":{
         "NVIDIA-A100-PCIe-80GB": 1248e12,
         "NVIDIA-A100-SXM-80GB": 1248e12,

@@ -92,7 +98,8 @@ def parse_nvidia_smi():
     gpu_stats = []
 
     gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
-    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
+    # gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
+    gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
 
     gpu_name = ""
     for index in gpu_indices:

@@ -104,7 +111,7 @@ def parse_nvidia_smi():
         name_match = gpu_name_pattern.search(line)
         gpu_info = {}
         if name_match:
-            gpu_name = name_match.group(1).strip()
+            gpu_name = ''.join(filter(None, name_match.groups())).strip()
         if match:
             temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
             gpu_info.update({

@@ -208,10 +215,15 @@ def get_gpu_details():
     gpus = GPUtil.getGPUs()
     gpu = gpus[0]
     name = gpu.name.replace(" ", "-")
-    # Convert memory from MB to GB and round to nearest whole number
     memory_gb = round(gpu.memoryTotal / 1024)
     memory = f"{memory_gb}GB"
+
+    for part in name.split('-'):
+        if part.endswith("GB") and part[:-2].isdigit():
+            name = name.replace(f"-{part}", "").replace(part, "")
+
     formatted_name = f"{name}-{memory}"
+
     return formatted_name
 
 def get_peak_bw(gpu_name):

@@ -223,7 +235,7 @@ def get_peak_flops(gpu_name, precision):
 def transfer_precision2bytes(precision):
     if precision == "float32":
         return 4
-    elif precision == "float16":
+    elif precision in ["float16", "bfloat16"]:
         return 2
     elif precision == "8bit":
         return 1
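The regex swap in parse_nvidia_smi changes what counts as the GPU name: the old pattern greedily captured through the memory suffix, while the new one keeps only the model token plus an optional RTX prefix, leaving get_gpu_details to strip and re-append memory itself. A sketch against made-up name strings:

import re

# Hypothetical name fields as they might appear in nvidia-smi output.
names = ["NVIDIA A100-SXM4-80GB", "NVIDIA RTX A5000"]

gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
for line in names:
    m = gpu_name_pattern.search(line)
    # filter(None, ...) drops the optional RTX group when it did not match.
    print(''.join(filter(None, m.groups())).strip())
# A100
# RTX A5000

Stopping at the first hyphen keeps variant suffixes like "-SXM4-80GB" out of the name, so lookups into PEAK_FLOPS_DICT depend only on the model and the separately formatted memory size.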