chivier committed
Commit 1aecf91 · 1 Parent(s): 27bdb63

sync from github

open-moe-llm-leaderboard-gh/backend-cli.py CHANGED
@@ -473,6 +473,7 @@ if __name__ == "__main__":
         precisions = args.precision.split(",")
         print(f"debug_model_names: {debug_model_names}, debug_task_name: {debug_task_name}, precisions: {precisions}")
         task_lst = TASKS_HARNESS.copy()
+        RESULTS_REPO = DEBUG_RESULTS_REPO
         for precision in precisions:
             for debug_model_name in debug_model_names:
                 for task in task_lst:
open-moe-llm-leaderboard-gh/src/backend/hflm_with_measurement.py CHANGED
@@ -37,6 +37,9 @@ from lm_eval.models.utils import (
     stop_sequences_criteria,
 )
 from lm_eval.models.huggingface import HFLM
+from src.utils import get_gpu_number, get_gpu_details, get_peak_bw, transfer_precision2bytes, get_peak_flops
+from src.submission.check_validity import get_model_size
+from src.envs import API
 
 
 class StopWatch(TextStreamer):
@@ -67,6 +70,9 @@ class StopWatch(TextStreamer):
 class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        self.pretrained = kwargs.get("pretrained", None)
+        self.revision = kwargs.get("revision", None)
+        self.precision = kwargs.get("dtype", None)
 
     def _loglikelihood_tokens(
         self,
@@ -288,7 +294,7 @@ class HFLMWithMeasurement(HFLM):
 
         return re_ord.get_original(res)
 
-    def _model_generate(self, context, max_length, stop, **generation_kwargs):
+    def _model_generate(self, context, max_tokens, stop, **generation_kwargs):
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
         #     remove temperature, as do_sample=False takes care of this
@@ -296,7 +302,7 @@ class HFLMWithMeasurement(HFLM):
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
         do_sample = generation_kwargs.get("do_sample", None)
 
-        is_gsm8k = generation_kwargs.get("is_gsm8k", False)
+        # is_gsm8k = generation_kwargs.get("is_gsm8k", False)
 
         # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
@@ -305,48 +311,133 @@ class HFLMWithMeasurement(HFLM):
         if do_sample is False and generation_kwargs.get("temperature") == 0.0:
             generation_kwargs.pop("temperature")
 
-        generation_kwargs.pop("is_gsm8k")
+        # if is_gsm8k:
+        #     generation_kwargs.pop("is_gsm8k")
+
+        context_length = context.shape[1]
 
-        if not is_gsm8k:
-            # build stopping criteria
-            stopping_criteria = stop_sequences_criteria(
-                self.tokenizer, stop, context.shape[1], context.shape[0]
-            )
-            stop_watch = StopWatch(self.tokenizer)
-            start = time()
-            res = self.model.generate(
-                input_ids=context,
-                max_length=max_length,
-                stopping_criteria=stopping_criteria,
-                pad_token_id=self.tokenizer.pad_token_id,
-                use_cache=True,
-                streamer=stop_watch,
-                **generation_kwargs,
-            )
-            end = time()
+        if self.model.__class__.__name__ == "MoE":
+            model_config = self.model.model.config
         else:
-            # print("Using GSM8K")
-            stop_watch = StopWatch(self.tokenizer)
-            start = time()
-            res = self.model.generate(
-                input_ids=context,
-                max_length=max_length,
-                eos_token_id=stop,
-                pad_token_id=self.tokenizer.pad_token_id,
-                use_cache=True,
-                streamer=stop_watch,
-                **generation_kwargs,
-            )
-            end = time()
+            model_config = self.model.config
+
+        if not self.precision:
+            if model_config.quantization_config._load_in_4bit:
+                self.precision = "4bit"
+            elif model_config.quantization_config._load_in_8bit:
+                self.precision = "8bit"
+            else:
+                raise ValueError("Unknown precision")
+
+        # print(self.model)
+        linear_count = 0
+        element_wise_mul = 0
+        for name, module in self.model.named_modules():
+            if ('layers.0.' in name or 'decoder.0.' in name) and ('attn' not in name):
+                if 'experts.0.' in name:
+                    if isinstance(module, torch.nn.Linear):
+                        # print(name, module)
+                        linear_count += 1
+                elif 'experts' not in name:
+                    if "gate" not in name or "gate_proj" in name:
+                        if "gate_proj" in name:
+                            element_wise_mul = 1
+                        if isinstance(module, torch.nn.Linear):
+                            # print(name, module)
+                            linear_count += 1
+                else:
+                    continue
+        print(f"linear_count: {linear_count}")
+
+        stopping_criteria = stop_sequences_criteria(
+            self.tokenizer, stop, context.shape[1], context.shape[0]
+        )
+        stop_watch = StopWatch(self.tokenizer)
+        start = time()
+        res = self.model.generate(
+            input_ids=context,
+            max_new_tokens=max_tokens,
+            stopping_criteria=stopping_criteria,
+            pad_token_id=self.tokenizer.pad_token_id,
+            use_cache=True,
+            streamer=stop_watch,
+            **generation_kwargs,
+        )
+        end = time()
 
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
+
+        precision_bytes = transfer_precision2bytes(self.precision)
+
+        model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
+        model_size_param = get_model_size(model_info=model_info, precision=self.precision)
+
+        n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else model_config.num_layers
+        d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
+
+        if hasattr(model_config, "num_experts_per_tok"):
+            n_experts_per_tok = model_config.num_experts_per_tok
+        elif hasattr(model_config, "num_selected_experts"):
+            n_experts_per_tok = model_config.num_selected_experts
+        else:
+            n_experts_per_tok = 1
+
+        if hasattr(model_config, "ffn_dim"):
+            d_ff = model_config.ffn_dim
+        elif hasattr(model_config, "intermediate_size"):
+            d_ff = model_config.intermediate_size
+        elif hasattr(model_config, "d_ff"):
+            d_ff = model_config.d_ff
+        else:
+            if hasattr(model_config, "ff_ratio"):
+                d_ff = d_model * model_config.ff_ratio
+            else:
+                raise ValueError("Unknown FFN dimension")
+
+        if hasattr(model_config, "num_local_experts"):
+            num_experts = model_config.num_local_experts
+        elif hasattr(model_config, "num_experts"):
+            num_experts = model_config.num_experts
+        else:
+            num_experts = 1
+
+        ffn_params = n_layers * d_ff * linear_count * d_model
+
+        shared_params = model_size_param * 1e9 - num_experts * ffn_params
+
+        model_size = shared_params + n_experts_per_tok * ffn_params
+
+        per_token_kv_size = 2 * n_layers * d_model * precision_bytes
+
+        peak_bw_single = get_peak_bw(get_gpu_details())
+        peak_bw = peak_bw_single * get_gpu_number()
+
+        context_prefill_size = context_length
+        kv_size = context_prefill_size * per_token_kv_size + (output_length - 1) * per_token_kv_size / 2
+
+        kv_size = kv_size / 1e9
+
+        n_vocab = model_config.vocab_size
 
         end_to_end_time = (end - start) / batch_size
         prefilling_time = stop_watch.prefilling_time / batch_size
         decoding_time = stop_watch.decoding_time / batch_size
         token_per_sec = output_length / decoding_time
-        return res, end_to_end_time, prefilling_time, token_per_sec
+        achieve_mem_bw = (model_size * precision_bytes / 1e9 + kv_size) * token_per_sec
+
+        avg_context_length = context_length + (output_length - 1) / 2
+        flops_per_token = 2 * model_size + ((linear_count + element_wise_mul) * n_layers * avg_context_length * d_model) + 4 * d_model + 2 * d_model * n_vocab
+        peak_flops_single = get_peak_flops(get_gpu_details(), self.precision)
+        peak_flops = peak_flops_single * get_gpu_number()
+
+        ## TODO only support llama-type decoder only models and moe models of switch transformer and mixtrial
+        mfu = token_per_sec * flops_per_token / peak_flops
+        mbu = achieve_mem_bw / peak_bw
+
+        print(f"mfu: {mfu}, mbu: {mbu}")
+
+        return res, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu
 
     def generate_until(
         self, requests: List[Instance], disable_tqdm: bool = False
@@ -423,15 +514,18 @@ class HFLMWithMeasurement(HFLM):
                     f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                 )
                 # add EOS token to stop sequences
-                eos = self.tok_decode(self.eot_token_id)
+                eos = "<|eot_id|>"
                 if not until:
                     until = [eos]
                 else:
                     until.append(eos)
 
-                is_gsm8k = kwargs.get("is_gsm8k", False)
-                if is_gsm8k:
-                    until = [self.tokenizer.eos_token_id, self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
+                # is_gsm8k = kwargs.get("is_gsm8k", False)
+                # if is_gsm8k:
+                #     until = ["Question:", "Question", "</s>"]
+                #     eos_ids = [self.tokenizer.eos_token_id,
+                #                self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
+
 
                 if "max_gen_toks" in kwargs.keys():
                     max_gen_toks = kwargs.pop("max_gen_toks")
@@ -457,11 +551,11 @@ class HFLMWithMeasurement(HFLM):
             context_enc = context_enc.to(self.device)
             attn_masks = attn_masks.to(self.device)
 
-            if "max_length" not in kwargs:
-                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
+            if "max_tokens" not in kwargs:
+                kwargs["max_tokens"] = max_gen_toks
 
             # perform batched generation
-            cont, end_to_end_time, prefilling_time, token_per_sec = self._model_generate(
+            cont, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu = self._model_generate(
                 context=context_enc,
                 attention_mask=attn_masks,
                 stop=until,
@@ -477,15 +571,16 @@ class HFLMWithMeasurement(HFLM):
 
                 s = self.tok_decode(cont_toks)
 
-                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
-                if not is_gsm8k:
-                    for term in until:
-                        if len(term) > 0:
-                            # ignore '' separator,
-                            # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
-                            s = s.split(term)[0]
-
-                res.append((s, end_to_end_time, prefilling_time, token_per_sec))
+                # # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
+                # if not is_gsm8k:
+                for term in until:
+                    if len(term) > 0:
+                        # ignore '' separator,
+                        # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
+                        s = s.split(term)[0]
+
+                # print(s)
+                res.append((s, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu))
 
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                 pbar.update(1)
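
Note: the MFU/MBU values introduced in _model_generate above reduce to two closed-form expressions. The sketch below restates them as a standalone function with made-up inputs; the real code derives the model shape from the Hugging Face config and the hardware peaks from get_peak_flops / get_peak_bw, and the peak bandwidth used here is an illustrative assumption, not a value from this repository.

    # Hedged sketch of the MFU/MBU formulas added in _model_generate above.
    # All inputs are illustrative placeholders.
    def estimate_mfu_mbu(token_per_sec, active_params, precision_bytes,
                         kv_gb_per_token, peak_flops, peak_bw_gbps):
        # Dominant 2*N matmul term; the attention and vocab-projection terms
        # from the full formula above are omitted for brevity.
        flops_per_token = 2 * active_params
        # Memory read per generated token: weights once plus KV cache, in GB.
        achieved_mem_bw = (active_params * precision_bytes / 1e9 + kv_gb_per_token) * token_per_sec
        mfu = token_per_sec * flops_per_token / peak_flops
        mbu = achieved_mem_bw / peak_bw_gbps
        return mfu, mbu

    # Example: ~13e9 active parameters at bfloat16 (2 bytes) on one GPU with
    # 624 TFLOPS peak (matching the bfloat16 A100 entry added to
    # PEAK_FLOPS_DICT in src/utils.py below) and an assumed ~2000 GB/s of
    # memory bandwidth.
    mfu, mbu = estimate_mfu_mbu(40, 13e9, 2, 0.5, 624e12, 2000)
    print(f"mfu: {mfu:.2%}, mbu: {mbu:.2%}")  # roughly 0.17% and 53%
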
open-moe-llm-leaderboard-gh/src/backend/moe_infinity.py CHANGED
@@ -31,8 +31,9 @@ class MoEHFLM(HFLMWithMeasurement):
         self.use_chat_template = use_chat_template
         if "device" in kwargs:
             kwargs.pop("device")
+        kwargs["device_map"] = "cuda:0"
         super().__init__(
-            *args, **kwargs, pretrained=pretrained, device_map="cuda:0"
+            *args, **kwargs, pretrained=pretrained
         ) # Assuming HFLM accepts a 'pretrained' arg and handles it
         # self._create_model()
         shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
open-moe-llm-leaderboard-gh/src/backend/run_eval_suite.py CHANGED
@@ -17,12 +17,16 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
+        mfu = sum([r[4] for r in results]) / len(results)
+        mbu = sum([r[5] for r in results]) / len(results)
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         result_dict = func(self, doc, processed_results, *args, **kwargs)
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
+        result_dict["mfu"] = mfu * 100
+        result_dict["mbu"] = mbu * 100
         return result_dict
     return wrapper
 ConfigurableTask.process_results = process_results_decorator(orig_process_results)
@@ -33,6 +37,8 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
+        aggregation_list["mfu"] = mean
+        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper
 ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
@@ -43,6 +49,8 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
+        higher_is_better_dict["mfu"] = True
+        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
 ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
open-moe-llm-leaderboard-gh/src/backend/tasks/gsm8k/gsm8k-custom.yaml CHANGED
@@ -22,18 +22,21 @@ metric_list:
       - "\\.$"
 generation_kwargs:
   until:
-    - "<|eot_id|>"
+    - "Question:"
+    - "Question"
+    - "</s>"
+    - "<|im_end|>"
   do_sample: false
   temperature: 0.0
-  is_gsm8k: true
+  # is_gsm8k: true
 repeats: 1
 num_fewshot: 5
 filter_list:
-  # - name: "strict-match"
-  #   filter:
-  #     - function: "regex"
-  #       regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
-  #     - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
+      - function: "take_first"
   - name: "flexible-extract"
     filter:
       - function: "regex"
open-moe-llm-leaderboard-gh/src/backend/tasks/measurement_task_utils.py CHANGED
@@ -12,6 +12,9 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
+        mfu = sum([r[4] for r in results]) / len(results)
+        mbu = sum([r[5] for r in results]) / len(results)
+
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         # Now call the original process_results with the processed results
@@ -19,6 +22,8 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
+        result_dict["mfu"] = mfu
+        result_dict["mbu"] = mbu
         return result_dict
     return wrapper
 
@@ -30,6 +35,8 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
+        aggregation_list["mfu"] = mean
+        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper
 
@@ -41,6 +48,8 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
+        higher_is_better_dict["mfu"] = True
+        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
 
open-moe-llm-leaderboard-gh/src/display/utils.py CHANGED
@@ -18,12 +18,16 @@ GPU_Power = 'Power(W)'
 GPU_Mem = 'Mem(G)'
 GPU_Name = "GPU"
 GPU_Util = 'Util(%)'
+MFU = 'MFU(%)'
+MBU = 'MBU(%)'
 BATCH_SIZE = 'bs'
 PRECISION = "Precision"
 system_metrics_to_name_map = {
     "end_to_end_time": f"{E2Es}",
     "prefilling_time": f"{PREs}",
     "decoding_throughput": f"{TS}",
+    "mfu": f"{MFU}",
+    "mbu": f"{MBU}"
 }
 
 gpu_metrics_to_name_map = {
@@ -75,7 +79,7 @@ class Tasks(Enum):
     # # XXX include me back at some point
     selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
-    gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (8-shot)
+    gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
 
 
 # These classes are for user facing column names,
open-moe-llm-leaderboard-gh/src/submission/check_validity.py CHANGED
@@ -74,7 +74,7 @@ def is_model_on_hub(
 
 
 def get_model_size(model_info: ModelInfo, precision: str):
-    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
+    size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
     except (AttributeError, TypeError):
open-moe-llm-leaderboard-gh/src/utils.py CHANGED
@@ -31,6 +31,12 @@ PEAK_FLOPS_DICT = {
         "NVIDIA-H100-PCIe-80GB": 1513e12,
         "NVIDIA-RTX-A5000-24GB": 444.4e12
     },
+    "bfloat16":{
+        "NVIDIA-A100-PCIe-80GB": 624e12,
+        "NVIDIA-A100-SXM-80GB": 624e12,
+        "NVIDIA-H100-PCIe-80GB": 1513e12,
+        "NVIDIA-RTX-A5000-24GB": 444.4e12
+    },
     "8bit":{
         "NVIDIA-A100-PCIe-80GB": 1248e12,
         "NVIDIA-A100-SXM-80GB": 1248e12,
@@ -92,7 +98,8 @@ def parse_nvidia_smi():
     gpu_stats = []
 
     gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
-    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
+    # gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
+    gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
 
     gpu_name = ""
     for index in gpu_indices:
@@ -104,7 +111,7 @@
         name_match = gpu_name_pattern.search(line)
         gpu_info = {}
         if name_match:
-            gpu_name = name_match.group(1).strip()
+            gpu_name = ''.join(filter(None, name_match.groups())).strip()
         if match:
             temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
             gpu_info.update({
@@ -208,10 +215,15 @@ def get_gpu_details():
     gpus = GPUtil.getGPUs()
     gpu = gpus[0]
     name = gpu.name.replace(" ", "-")
-    # Convert memory from MB to GB and round to nearest whole number
     memory_gb = round(gpu.memoryTotal / 1024)
     memory = f"{memory_gb}GB"
+
+    for part in name.split('-'):
+        if part.endswith("GB") and part[:-2].isdigit():
+            name = name.replace(f"-{part}", "").replace(part, "")
+
     formatted_name = f"{name}-{memory}"
+
     return formatted_name
 
 def get_peak_bw(gpu_name):
@@ -223,7 +235,7 @@ def get_peak_flops(gpu_name, precision):
 def transfer_precision2bytes(precision):
     if precision == "float32":
         return 4
-    elif precision == "float16":
+    elif precision in ["float16", "bfloat16"]:
         return 2
     elif precision == "8bit":
         return 1
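
For reference, a small sketch of how the reworked GPU-name handling behaves. The nvidia-smi strings below are assumptions for illustration; the exact output depends on the driver and GPU model.

    import re

    # Pattern added above: an optional "RTX " prefix plus the first model token.
    gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')

    for line in ["NVIDIA A100-SXM4-80GB", "NVIDIA RTX A5000"]:
        m = gpu_name_pattern.search(line)
        if m:
            # Same join-and-strip used in parse_nvidia_smi after this change;
            # prints "A100" and "RTX A5000" for these sample strings.
            print(''.join(filter(None, m.groups())).strip())
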