nadsaa committed
Commit d83f3a1 · 1 Parent(s): 3abe9fa

multilingual results

app.py CHANGED
@@ -31,14 +31,14 @@ from src.display.utils import (
31
  MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
32
  ACI_BENCHMARK_COLS,
33
  SOAP_BENCHMARK_COLS,
34
- CLOSED_ENDED_ARABIC_BENCHMARK_COLS,
35
  DATASET_COLS,
36
  OPEN_ENDED_COLS,
37
  MED_SAFETY_COLS,
38
  MEDICAL_SUMMARIZATION_COLS,
39
  ACI_COLS,
40
  SOAP_COLS,
41
- CLOSED_ENDED_ARABIC_COLS,
42
  EVAL_COLS,
43
  EVAL_TYPES,
44
  NUMERIC_INTERVALS,
@@ -50,7 +50,23 @@ from src.display.utils import (
50
  Precision,
51
  WeightType,
52
  fields,
53
- render_generation_templates
54
  )
55
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, PRIVATE_REPO
56
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -96,9 +112,28 @@ aci_leaderboard_df = aci_original_df.copy()
96
  _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
97
  soap_leaderboard_df = soap_original_df.copy()
98
 
99
- if PRIVATE_REPO:
100
- _, closed_ended_arabic_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, CLOSED_ENDED_ARABIC_COLS, CLOSED_ENDED_ARABIC_BENCHMARK_COLS, "score", "closed_ended_arabic")
101
- closed_ended_arabic_leaderboard_df = closed_ended_arabic_original_df.copy()
102
 
103
  # breakpoint()
104
  # # Token based results
@@ -136,9 +171,28 @@ def update_df(shown_columns, subset="datasets"):
136
  elif subset == "soap":
137
  leaderboard_table_df = soap_leaderboard_df.copy()
138
  hidden_leader_board_df = soap_original_df
139
- elif PRIVATE_REPO and subset == "closed-ended-arabic":
140
- leaderboard_table_df = closed_ended_arabic_leaderboard_df.copy()
141
- hidden_leader_board_df = closed_ended_arabic_original_df
142
  # else:
143
  # match evaluation_metric:
144
  # case "Span Based":
@@ -258,128 +312,140 @@ def filter_models(
258
  demo = gr.Blocks(css=custom_css)
259
  with demo:
260
  print("hello")
261
- if PRIVATE_REPO:
262
- gr.HTML(TITLE)
263
  gr.HTML(LOGO)
264
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
265
 
266
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
267
- with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
268
- with gr.Row():
269
- with gr.Column():
270
- with gr.Row():
271
- search_bar = gr.Textbox(
272
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
273
- show_label=False,
274
- elem_id="search-bar",
275
  )
276
- with gr.Row():
277
- shown_columns = gr.CheckboxGroup(
278
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
279
- value=[
280
- c.name
281
- for c in fields(AutoEvalColumn)
282
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)
283
  ],
284
- label="Select columns to show",
285
- elem_id="column-select",
286
- interactive=True,
287
  )
288
- # with gr.Row():
289
- # deleted_models_visibility = gr.Checkbox(
290
- # value=False, label="Show gated/private/deleted models", interactive=True
291
- # )
292
- with gr.Column(min_width=320):
293
- # with gr.Box(elem_id="box-filter"):
294
- filter_columns_type = gr.CheckboxGroup(
295
- label="Model Types",
296
- choices=[t.to_str() for t in ModelType],
297
- value=[t.to_str() for t in ModelType],
298
- interactive=True,
299
- elem_id="filter-columns-type",
300
- )
301
- # filter_columns_architecture = gr.CheckboxGroup(
302
- # label="Architecture Types",
303
- # choices=[i.value.name for i in ModelArch],
304
- # value=[i.value.name for i in ModelArch],
305
- # interactive=True,
306
- # elem_id="filter-columns-architecture",
307
- # )
308
- filter_domain_specific = gr.CheckboxGroup(
309
- label="Domain Specificity",
310
- choices=["🏥 Clinical models", "Generic models"],
311
- value=["🏥 Clinical models", "Generic models"],
312
- interactive=True,
313
- elem_id="filter-columns-type",
314
- )
315
- filter_columns_size = gr.CheckboxGroup(
316
- label="Model sizes (in billions of parameters)",
317
- choices=list(NUMERIC_INTERVALS.keys()),
318
- value=list(NUMERIC_INTERVALS.keys()),
319
- interactive=True,
320
- elem_id="filter-columns-size",
321
- )
322
 
323
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="open_ended")
324
 
325
- leaderboard_table = gr.components.Dataframe(
326
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
327
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
328
- datatype=TYPES,
329
- elem_id="leaderboard-table",
330
- interactive=False,
331
- visible=True,
332
- )
333
 
334
- # Dummy leaderboard for handling the case when the user uses backspace key
335
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
336
- value=datasets_original_df[OPEN_ENDED_COLS],
337
- headers=OPEN_ENDED_COLS,
338
- datatype=TYPES,
339
- visible=False,
340
- )
341
 
342
-
343
- search_bar.submit(
344
- update_table,
345
- [
346
- hidden_leaderboard_table_for_search,
347
- shown_columns,
348
- search_bar,
349
- filter_columns_type,
350
- filter_domain_specific,
351
- filter_columns_size
352
- # filter_columns_architecture
353
- ],
354
- leaderboard_table,
355
- )
356
- for selector in [
357
- shown_columns,
358
- filter_columns_type,
359
- filter_domain_specific,
360
- # filter_columns_architecture,
361
- filter_columns_size,
362
- # deleted_models_visibility,
363
- ]:
364
- selector.change(
365
- update_table,
366
- [
367
- hidden_leaderboard_table_for_search,
368
- shown_columns,
369
- search_bar,
370
- filter_columns_type,
371
- filter_domain_specific,
372
- filter_columns_size
373
- # filter_columns_architecture,
374
- ],
375
- leaderboard_table,
376
- queue=True,
377
- )
378
- with gr.Accordion("💬 Generation templates", open=False):
379
- with gr.Accordion("Response generation", open=False):
380
- system_prompt, user_prompt = render_generation_templates(task="open_ended", generation_type="response_generation")
381
- with gr.Accordion("Scoring Rubric", open=False):
382
- system_prompt, user_prompt = render_generation_templates(task="open_ended", generation_type="scoring_rubric")
383
  with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=2):
384
  with gr.Row():
385
  with gr.Column():
@@ -387,7 +453,7 @@ with demo:
387
  search_bar = gr.Textbox(
388
  placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
389
  show_label=False,
390
- elem_id="search-bar",
391
  )
392
  with gr.Row():
393
  shown_columns = gr.CheckboxGroup(
@@ -398,64 +464,50 @@ with demo:
398
  if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)
399
  ],
400
  label="Select columns to show",
401
- elem_id="column-select",
402
  interactive=True,
403
  )
404
- # with gr.Row():
405
- # deleted_models_visibility = gr.Checkbox(
406
- # value=False, label="Show gated/private/deleted models", interactive=True
407
- # )
408
  with gr.Column(min_width=320):
409
- # with gr.Box(elem_id="box-filter"):
410
  filter_columns_type = gr.CheckboxGroup(
411
  label="Model Types",
412
  choices=[t.to_str() for t in ModelType],
413
  value=[t.to_str() for t in ModelType],
414
  interactive=True,
415
- elem_id="filter-columns-type",
416
  )
417
- # filter_columns_architecture = gr.CheckboxGroup(
418
- # label="Architecture Types",
419
- # choices=[i.value.name for i in ModelArch],
420
- # value=[i.value.name for i in ModelArch],
421
- # interactive=True,
422
- # elem_id="filter-columns-architecture",
423
- # )
424
  filter_domain_specific = gr.CheckboxGroup(
425
  label="Domain Specificity",
426
  choices=["🏥 Clinical models", "Generic models"],
427
  value=["🏥 Clinical models", "Generic models"],
428
  interactive=True,
429
- elem_id="filter-columns-type",
430
  )
431
  filter_columns_size = gr.CheckboxGroup(
432
  label="Model sizes (in billions of parameters)",
433
  choices=list(NUMERIC_INTERVALS.keys()),
434
  value=list(NUMERIC_INTERVALS.keys()),
435
  interactive=True,
436
- elem_id="filter-columns-size",
437
  )
438
 
439
  datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="med_safety")
440
 
441
- leaderboard_table = gr.components.Dataframe(
442
  value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
443
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
444
  datatype=TYPES,
445
- elem_id="leaderboard-table",
446
  interactive=False,
447
  visible=True,
448
  )
449
 
450
- # Dummy leaderboard for handling the case when the user uses backspace key
451
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
452
  value=datasets_original_df[MED_SAFETY_COLS],
453
  headers=MED_SAFETY_COLS,
454
  datatype=TYPES,
455
  visible=False,
456
  )
457
-
458
-
459
  search_bar.submit(
460
  update_table,
461
  [
@@ -465,16 +517,15 @@ with demo:
465
  filter_columns_type,
466
  filter_domain_specific,
467
  filter_columns_size
468
- # filter_columns_architecture
469
  ],
470
  leaderboard_table,
471
  )
 
472
  for selector in [
473
  shown_columns,
474
  filter_columns_type,
475
  filter_domain_specific,
476
  filter_columns_size,
477
- # deleted_models_visibility,
478
  ]:
479
  selector.change(
480
  update_table,
@@ -489,11 +540,13 @@ with demo:
489
  leaderboard_table,
490
  queue=True,
491
  )
 
492
  with gr.Accordion("💬 Generation templates", open=False):
493
  with gr.Accordion("Response generation", open=False):
494
  system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="response_generation")
495
  with gr.Accordion("Scoring Rubric", open=False):
496
  system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="scoring_rubric")
 
497
  with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=3):
498
  gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
499
  with gr.Row():
@@ -502,7 +555,7 @@ with demo:
502
  search_bar = gr.Textbox(
503
  placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
504
  show_label=False,
505
- elem_id="search-bar",
506
  )
507
  with gr.Row():
508
  shown_columns = gr.CheckboxGroup(
@@ -513,64 +566,50 @@ with demo:
513
  if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
514
  ],
515
  label="Select columns to show",
516
- elem_id="column-select",
517
  interactive=True,
518
  )
519
- # with gr.Row():
520
- # deleted_models_visibility = gr.Checkbox(
521
- # value=False, label="Show gated/private/deleted models", interactive=True
522
- # )
523
  with gr.Column(min_width=320):
524
- # with gr.Box(elem_id="box-filter"):
525
  filter_columns_type = gr.CheckboxGroup(
526
  label="Model Types",
527
  choices=[t.to_str() for t in ModelType],
528
  value=[t.to_str() for t in ModelType],
529
  interactive=True,
530
- elem_id="filter-columns-type",
531
  )
532
- # filter_columns_architecture = gr.CheckboxGroup(
533
- # label="Architecture Types",
534
- # choices=[i.value.name for i in ModelArch],
535
- # value=[i.value.name for i in ModelArch],
536
- # interactive=True,
537
- # elem_id="filter-columns-architecture",
538
- # )
539
  filter_domain_specific = gr.CheckboxGroup(
540
  label="Domain Specificity",
541
  choices=["🏥 Clinical models", "Generic models"],
542
  value=["🏥 Clinical models", "Generic models"],
543
  interactive=True,
544
- elem_id="filter-columns-type",
545
  )
546
  filter_columns_size = gr.CheckboxGroup(
547
  label="Model sizes (in billions of parameters)",
548
  choices=list(NUMERIC_INTERVALS.keys()),
549
  value=list(NUMERIC_INTERVALS.keys()),
550
  interactive=True,
551
- elem_id="filter-columns-size",
552
  )
553
 
554
  datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="medical_summarization")
555
 
556
- leaderboard_table = gr.components.Dataframe(
557
  value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
558
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
559
  datatype=TYPES,
560
- elem_id="leaderboard-table",
561
  interactive=False,
562
  visible=True,
563
  )
564
 
565
- # Dummy leaderboard for handling the case when the user uses backspace key
566
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
567
  value=datasets_original_df[MEDICAL_SUMMARIZATION_COLS],
568
  headers=MEDICAL_SUMMARIZATION_COLS,
569
  datatype=TYPES,
570
  visible=False,
571
  )
572
-
573
-
574
  search_bar.submit(
575
  update_table,
576
  [
@@ -580,16 +619,15 @@ with demo:
580
  filter_columns_type,
581
  filter_domain_specific,
582
  filter_columns_size
583
- # filter_columns_architecture
584
  ],
585
  leaderboard_table,
586
  )
 
587
  for selector in [
588
  shown_columns,
589
  filter_columns_type,
590
  filter_domain_specific,
591
  filter_columns_size,
592
- # deleted_models_visibility,
593
  ]:
594
  selector.change(
595
  update_table,
@@ -604,24 +642,26 @@ with demo:
604
  leaderboard_table,
605
  queue=True,
606
  )
 
607
  with gr.Accordion("💬 Generation templates", open=False):
608
  with gr.Accordion("Response generation", open=False):
609
  system_prompt, user_prompt = render_generation_templates(task="medical_summarization", generation_type="response_generation")
610
  with gr.Accordion("Question generation", open=False):
611
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
612
  with gr.Accordion("Cross Examination", open=False):
613
- system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
 
614
  with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=4):
615
  gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
616
- with gr.Tabs(elem_classes="tab-buttons2") as tabs:
617
- with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-table2", id=0):
618
  with gr.Row():
619
  with gr.Column():
620
  with gr.Row():
621
  search_bar = gr.Textbox(
622
  placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
623
  show_label=False,
624
- elem_id="search-bar",
625
  )
626
  with gr.Row():
627
  shown_columns = gr.CheckboxGroup(
@@ -632,64 +672,50 @@ with demo:
632
  if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)
633
  ],
634
  label="Select columns to show",
635
- elem_id="column-select",
636
  interactive=True,
637
  )
638
- # with gr.Row():
639
- # deleted_models_visibility = gr.Checkbox(
640
- # value=False, label="Show gated/private/deleted models", interactive=True
641
- # )
642
  with gr.Column(min_width=320):
643
- # with gr.Box(elem_id="box-filter"):
644
  filter_columns_type = gr.CheckboxGroup(
645
  label="Model Types",
646
  choices=[t.to_str() for t in ModelType],
647
  value=[t.to_str() for t in ModelType],
648
  interactive=True,
649
- elem_id="filter-columns-type",
650
  )
651
- # filter_columns_architecture = gr.CheckboxGroup(
652
- # label="Architecture Types",
653
- # choices=[i.value.name for i in ModelArch],
654
- # value=[i.value.name for i in ModelArch],
655
- # interactive=True,
656
- # elem_id="filter-columns-architecture",
657
- # )
658
  filter_domain_specific = gr.CheckboxGroup(
659
  label="Domain Specificity",
660
  choices=["🏥 Clinical models", "Generic models"],
661
  value=["🏥 Clinical models", "Generic models"],
662
  interactive=True,
663
- elem_id="filter-columns-type",
664
  )
665
  filter_columns_size = gr.CheckboxGroup(
666
  label="Model sizes (in billions of parameters)",
667
  choices=list(NUMERIC_INTERVALS.keys()),
668
  value=list(NUMERIC_INTERVALS.keys()),
669
  interactive=True,
670
- elem_id="filter-columns-size",
671
  )
672
 
673
  datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="aci")
674
 
675
- leaderboard_table = gr.components.Dataframe(
676
  value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
677
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
678
  datatype=TYPES,
679
- elem_id="leaderboard-table",
680
  interactive=False,
681
  visible=True,
682
  )
683
 
684
- # Dummy leaderboard for handling the case when the user uses backspace key
685
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
686
  value=datasets_original_df[ACI_COLS],
687
  headers=ACI_COLS,
688
  datatype=TYPES,
689
  visible=False,
690
  )
691
-
692
-
693
  search_bar.submit(
694
  update_table,
695
  [
@@ -699,16 +725,15 @@ with demo:
699
  filter_columns_type,
700
  filter_domain_specific,
701
  filter_columns_size
702
- # filter_columns_architecture
703
  ],
704
  leaderboard_table,
705
  )
 
706
  for selector in [
707
  shown_columns,
708
  filter_columns_type,
709
  filter_domain_specific,
710
  filter_columns_size,
711
- # deleted_models_visibility,
712
  ]:
713
  selector.change(
714
  update_table,
@@ -723,14 +748,15 @@ with demo:
723
  leaderboard_table,
724
  queue=True,
725
  )
726
- with gr.TabItem("SOAP Notes", elem_id="llm-benchmark-tab-table2", id=1):
 
727
  with gr.Row():
728
  with gr.Column():
729
  with gr.Row():
730
  search_bar = gr.Textbox(
731
  placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
732
  show_label=False,
733
- elem_id="search-bar",
734
  )
735
  with gr.Row():
736
  shown_columns = gr.CheckboxGroup(
@@ -741,64 +767,50 @@ with demo:
741
  if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)
742
  ],
743
  label="Select columns to show",
744
- elem_id="column-select",
745
  interactive=True,
746
  )
747
- # with gr.Row():
748
- # deleted_models_visibility = gr.Checkbox(
749
- # value=False, label="Show gated/private/deleted models", interactive=True
750
- # )
751
  with gr.Column(min_width=320):
752
- # with gr.Box(elem_id="box-filter"):
753
  filter_columns_type = gr.CheckboxGroup(
754
  label="Model Types",
755
  choices=[t.to_str() for t in ModelType],
756
  value=[t.to_str() for t in ModelType],
757
  interactive=True,
758
- elem_id="filter-columns-type",
759
  )
760
- # filter_columns_architecture = gr.CheckboxGroup(
761
- # label="Architecture Types",
762
- # choices=[i.value.name for i in ModelArch],
763
- # value=[i.value.name for i in ModelArch],
764
- # interactive=True,
765
- # elem_id="filter-columns-architecture",
766
- # )
767
  filter_domain_specific = gr.CheckboxGroup(
768
  label="Domain Specificity",
769
  choices=["🏥 Clinical models", "Generic models"],
770
  value=["🏥 Clinical models", "Generic models"],
771
  interactive=True,
772
- elem_id="filter-columns-type",
773
  )
774
  filter_columns_size = gr.CheckboxGroup(
775
  label="Model sizes (in billions of parameters)",
776
  choices=list(NUMERIC_INTERVALS.keys()),
777
  value=list(NUMERIC_INTERVALS.keys()),
778
  interactive=True,
779
- elem_id="filter-columns-size",
780
  )
781
 
782
  datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="soap")
783
 
784
- leaderboard_table = gr.components.Dataframe(
785
  value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
786
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
787
  datatype=TYPES,
788
- elem_id="leaderboard-table",
789
  interactive=False,
790
  visible=True,
791
  )
792
 
793
- # Dummy leaderboard for handling the case when the user uses backspace key
794
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
795
  value=datasets_original_df[SOAP_COLS],
796
  headers=SOAP_COLS,
797
  datatype=TYPES,
798
  visible=False,
799
  )
800
-
801
-
802
  search_bar.submit(
803
  update_table,
804
  [
@@ -808,16 +820,15 @@ with demo:
808
  filter_columns_type,
809
  filter_domain_specific,
810
  filter_columns_size
811
- # filter_columns_architecture
812
  ],
813
  leaderboard_table,
814
  )
 
815
  for selector in [
816
  shown_columns,
817
  filter_columns_type,
818
  filter_domain_specific,
819
  filter_columns_size,
820
- # deleted_models_visibility,
821
  ]:
822
  selector.change(
823
  update_table,
@@ -832,6 +843,7 @@ with demo:
832
  leaderboard_table,
833
  queue=True,
834
  )
 
835
  with gr.Accordion("💬 Generation templates", open=False):
836
  with gr.Accordion("ACI-Bench Response generation", open=False):
837
  system_prompt, user_prompt = render_generation_templates(task="aci", generation_type="response_generation")
@@ -840,87 +852,93 @@ with demo:
840
  with gr.Accordion("Question generation", open=False):
841
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
842
  with gr.Accordion("Cross Examination", open=False):
843
- system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
844
- if PRIVATE_REPO:
845
- with gr.TabItem("Dev Evals", elem_id="llm-benchmark-tab-table", id=100):
846
- with gr.Tabs(elem_classes="tab-buttons2") as tabs:
847
- with gr.TabItem("🏅 Arabic Closed Ended Evaluation", elem_id="llm-benchmark-tab-table100", id=0):
848
- with gr.Row():
849
- with gr.Column():
850
- with gr.Row():
851
- search_bar = gr.Textbox(
852
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
853
- show_label=False,
854
- elem_id="search-bar",
855
- )
856
- with gr.Row():
857
- shown_columns = gr.CheckboxGroup(
858
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_arabic_col)],
859
- value=[
860
- c.name
861
- for c in fields(AutoEvalColumn)
862
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_arabic_col)
863
- ],
864
- label="Select columns to show",
865
- elem_id="column-select",
866
- interactive=True,
867
- )
868
- # with gr.Row():
869
- # deleted_models_visibility = gr.Checkbox(
870
- # value=False, label="Show gated/private/deleted models", interactive=True
871
- # )
872
- with gr.Column(min_width=320):
873
- # with gr.Box(elem_id="box-filter"):
874
- filter_columns_type = gr.CheckboxGroup(
875
- label="Model Types",
876
- choices=[t.to_str() for t in ModelType],
877
- value=[t.to_str() for t in ModelType],
878
- interactive=True,
879
- elem_id="filter-columns-type",
880
- )
881
- # filter_columns_architecture = gr.CheckboxGroup(
882
- # label="Architecture Types",
883
- # choices=[i.value.name for i in ModelArch],
884
- # value=[i.value.name for i in ModelArch],
885
- # interactive=True,
886
- # elem_id="filter-columns-architecture",
887
- # )
888
- filter_domain_specific = gr.CheckboxGroup(
889
- label="Domain Specificity",
890
- choices=["🏥 Clinical models", "Generic models"],
891
- value=["🏥 Clinical models", "Generic models"],
892
- interactive=True,
893
- elem_id="filter-columns-type",
894
  )
895
- filter_columns_size = gr.CheckboxGroup(
896
- label="Model sizes (in billions of parameters)",
897
- choices=list(NUMERIC_INTERVALS.keys()),
898
- value=list(NUMERIC_INTERVALS.keys()),
899
  interactive=True,
900
- elem_id="filter-columns-size",
901
  )
902
 
903
- closed_ended_arabic_leaderboard_df, closed_ended_arabic_original_df = update_df(shown_columns.value, subset="closed-ended-arabic")
904
-
905
- leaderboard_table = gr.components.Dataframe(
906
- value=closed_ended_arabic_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
907
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
908
- datatype=TYPES,
909
- elem_id="leaderboard-table",
910
- interactive=False,
911
- visible=True,
912
- )
913
-
914
- # Dummy leaderboard for handling the case when the user uses backspace key
915
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
916
- value=closed_ended_arabic_original_df[CLOSED_ENDED_ARABIC_COLS],
917
- headers=CLOSED_ENDED_ARABIC_COLS,
918
- datatype=TYPES,
919
- visible=False,
920
- )
921
-
922
-
923
- search_bar.submit(
924
  update_table,
925
  [
926
  hidden_leaderboard_table_for_search,
@@ -929,256 +947,114 @@ with demo:
929
  filter_columns_type,
930
  filter_domain_specific,
931
  filter_columns_size
932
- # filter_columns_architecture
933
  ],
934
  leaderboard_table,
 
935
  )
936
- for selector in [
937
- shown_columns,
938
- filter_columns_type,
939
- filter_domain_specific,
940
- # filter_columns_architecture,
941
- filter_columns_size,
942
- # deleted_models_visibility,
943
- ]:
944
- selector.change(
945
- update_table,
946
- [
947
- hidden_leaderboard_table_for_search,
948
- shown_columns,
949
- search_bar,
950
- filter_columns_type,
951
- filter_domain_specific,
952
- filter_columns_size
953
- # filter_columns_architecture,
954
- ],
955
- leaderboard_table,
956
- queue=True,
957
- )
958
- with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=0):
959
- with gr.Row():
960
- with gr.Column():
961
  with gr.Row():
962
- search_bar = gr.Textbox(
963
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
964
- show_label=False,
965
- elem_id="search-bar",
966
- )
967
  with gr.Row():
968
- shown_columns = gr.CheckboxGroup(
969
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
970
- value=[
971
- c.name
972
- for c in fields(AutoEvalColumn)
973
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
974
- ],
975
- label="Select columns to show",
976
- elem_id="column-select",
977
- interactive=True,
978
- )
979
- # with gr.Row():
980
- # deleted_models_visibility = gr.Checkbox(
981
- # value=False, label="Show gated/private/deleted models", interactive=True
982
- # )
983
- with gr.Column(min_width=320):
984
- # with gr.Box(elem_id="box-filter"):
985
- filter_columns_type = gr.CheckboxGroup(
986
- label="Model Types",
987
- choices=[t.to_str() for t in ModelType],
988
- value=[t.to_str() for t in ModelType],
989
- interactive=True,
990
- elem_id="filter-columns-type",
991
- )
992
- # filter_columns_architecture = gr.CheckboxGroup(
993
- # label="Architecture Types",
994
- # choices=[i.value.name for i in ModelArch],
995
- # value=[i.value.name for i in ModelArch],
996
- # interactive=True,
997
- # elem_id="filter-columns-architecture",
998
- # )
999
- filter_domain_specific = gr.CheckboxGroup(
1000
- label="Domain Specificity",
1001
- choices=["🏥 Clinical models", "Generic models"],
1002
- value=["🏥 Clinical models", "Generic models"],
1003
- interactive=True,
1004
- elem_id="filter-columns-type",
1005
- )
1006
- filter_columns_size = gr.CheckboxGroup(
1007
- label="Model sizes (in billions of parameters)",
1008
- choices=list(NUMERIC_INTERVALS.keys()),
1009
- value=list(NUMERIC_INTERVALS.keys()),
1010
- interactive=True,
1011
- elem_id="filter-columns-size",
1012
- )
1013
-
1014
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
1015
-
1016
- leaderboard_table = gr.components.Dataframe(
1017
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1018
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1019
- datatype=TYPES,
1020
- elem_id="leaderboard-table",
1021
- interactive=False,
1022
- visible=True,
1023
- )
1024
-
1025
- # Dummy leaderboard for handling the case when the user uses backspace key
1026
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
1027
- value=datasets_original_df[DATASET_COLS],
1028
- headers=DATASET_COLS,
1029
- datatype=TYPES,
1030
- visible=False,
1031
- )
1032
-
1033
-
1034
- search_bar.submit(
1035
- update_table,
1036
- [
1037
- hidden_leaderboard_table_for_search,
1038
- shown_columns,
1039
- search_bar,
1040
- filter_columns_type,
1041
- filter_domain_specific,
1042
- filter_columns_size
1043
- # filter_columns_architecture
1044
- ],
1045
- leaderboard_table,
1046
- )
1047
- for selector in [
1048
- shown_columns,
1049
- filter_columns_type,
1050
- filter_domain_specific,
1051
- # filter_columns_architecture,
1052
- filter_columns_size,
1053
- # deleted_models_visibility,
1054
- ]:
1055
- selector.change(
1056
- update_table,
1057
- [
1058
- hidden_leaderboard_table_for_search,
1059
- shown_columns,
1060
- search_bar,
1061
- filter_columns_type,
1062
- filter_domain_specific,
1063
- filter_columns_size
1064
- # filter_columns_architecture,
1065
- ],
1066
- leaderboard_table,
1067
- queue=True,
1068
- )
1069
-
1070
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=5):
1071
- gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
1072
- gr.HTML(FIVE_PILLAR_DIAGRAM)
1073
- gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
1074
- # gr.HTML(EVALUATION_EXAMPLE_IMG, elem_classes="logo")
1075
- # gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
1076
- # gr.HTML(ENTITY_DISTRIBUTION_IMG, elem_classes="logo")
1077
- # gr.Markdown(LLM_BENCHMARKS_TEXT_3, elem_classes="markdown-text")
1078
-
1079
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=6):
1080
- with gr.Column():
1081
- with gr.Row():
1082
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
1083
-
1084
- with gr.Column():
1085
- with gr.Accordion(
1086
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
1087
- open=False,
1088
- ):
1089
- with gr.Row():
1090
- finished_eval_table = gr.components.Dataframe(
1091
- value=finished_eval_queue_df,
1092
- headers=EVAL_COLS,
1093
- datatype=EVAL_TYPES,
1094
- row_count=5,
1095
  )
1096
- with gr.Accordion(
1097
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
1098
- open=False,
1099
- ):
1100
- with gr.Row():
1101
- running_eval_table = gr.components.Dataframe(
1102
- value=running_eval_queue_df,
1103
- headers=EVAL_COLS,
1104
- datatype=EVAL_TYPES,
1105
- row_count=5,
1106
  )
1107
-
1108
- with gr.Accordion(
1109
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
1110
- open=False,
1111
- ):
1112
- with gr.Row():
1113
- pending_eval_table = gr.components.Dataframe(
1114
- value=pending_eval_queue_df,
1115
- headers=EVAL_COLS,
1116
- datatype=EVAL_TYPES,
1117
- row_count=5,
1118
  )
1119
- with gr.Row():
1120
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
1121
 
1122
- with gr.Row():
1123
- with gr.Column():
1124
- model_name_textbox = gr.Textbox(label="Model name")
1125
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
1126
- model_type = gr.Dropdown(
1127
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
1128
- label="Model type",
1129
- multiselect=False,
1130
- value=None,
1131
- interactive=True,
1132
  )
1133
-
1134
- with gr.Column():
1135
- precision = gr.Dropdown(
1136
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
1137
- label="Precision",
1138
- multiselect=False,
1139
- value="auto",
1140
- interactive=True,
1141
  )
1142
- weight_type = gr.Dropdown(
1143
- choices=[i.value.name for i in WeightType],
1144
- label="Weights type",
1145
- multiselect=False,
1146
- value=WeightType.Original.value.name,
1147
- interactive=False,
1148
  )
1149
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)", interactive=False)
1150
- with gr.Row():
1151
- domain_specific_toggle = gr.Checkbox(
1152
- label="Domain specific",
1153
- value=False,
1154
- info="Is your model medically oriented?",
1155
- )
1156
- chat_template_toggle = gr.Checkbox(
1157
- label="Use chat template",
1158
- value=False,
1159
- info="Is your model a chat model?",
1160
- )
1161
-
1162
- submit_button = gr.Button("Submit Eval")
1163
- submission_result = gr.Markdown()
1164
- submit_button.click(
1165
- add_new_eval,
1166
- [
1167
- model_name_textbox,
1168
- base_model_name_textbox,
1169
- revision_name_textbox,
1170
- model_type,
1171
- domain_specific_toggle,
1172
- chat_template_toggle,
1173
- precision,
1174
- weight_type
1175
- ],
1176
- submission_result,
1177
- )
1178
-
1179
-
1180
- with gr.Row():
1181
- with gr.Accordion("📙 Citation", open=False):
1182
  citation_button = gr.Textbox(
1183
  value=CITATION_BUTTON_TEXT,
1184
  label=CITATION_BUTTON_LABEL,
@@ -1190,4 +1066,4 @@ with demo:
1190
  scheduler = BackgroundScheduler()
1191
  scheduler.add_job(restart_space, "interval", seconds=1800)
1192
  scheduler.start()
1193
- demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'])
 
31
  MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
32
  ACI_BENCHMARK_COLS,
33
  SOAP_BENCHMARK_COLS,
34
+ #CLOSED_ENDED_ARABIC_BENCHMARK_COLS,
35
  DATASET_COLS,
36
  OPEN_ENDED_COLS,
37
  MED_SAFETY_COLS,
38
  MEDICAL_SUMMARIZATION_COLS,
39
  ACI_COLS,
40
  SOAP_COLS,
41
+ #CLOSED_ENDED_ARABIC_COLS,
42
  EVAL_COLS,
43
  EVAL_TYPES,
44
  NUMERIC_INTERVALS,
 
50
  Precision,
51
  WeightType,
52
  fields,
53
+ render_generation_templates,
54
+ OpenEndedArabic_COLS,
55
+ OpenEndedArabic_BENCHMARK_COLS,
56
+ OpenEndedFrench_COLS,
57
+ OpenEndedFrench_BENCHMARK_COLS,
58
+ OpenEndedPortuguese_COLS,
59
+ OpenEndedPortuguese_BENCHMARK_COLS,
60
+ OpenEndedRomanian_COLS,
61
+ OpenEndedRomanian_BENCHMARK_COLS,
62
+ OpenEndedGreek_COLS,
63
+ OpenEndedGreek_BENCHMARK_COLS,
64
+ OpenEndedSpanish_COLS,
65
+ OpenEndedSpanish_BENCHMARK_COLS,
66
+ ClosedEndedMultilingual_COLS,
67
+ ClosedEndedMultilingual_BENCHMARK_COLS,
68
+
69
+
70
  )
71
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, PRIVATE_REPO
72
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
 
112
  _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
113
  soap_leaderboard_df = soap_original_df.copy()
114
 
115
+
116
+ _, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
117
+ _, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
118
+ _, open_ended_portuguese_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS, "score", "open_ended_portuguese")
119
+ _, open_ended_romanian_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, "score", "open_ended_romanian")
120
+ _, open_ended_greek_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedGreek_COLS, OpenEndedGreek_BENCHMARK_COLS, "score", "open_ended_greek")
121
+ _, open_ended_spanish_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS, "score", "open_ended_spanish")
122
+ _, closed_ended_multilingual_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS, "score", "closed_ended_multilingual")
123
+
124
+
125
+ open_ended_arabic_leaderboard_df = open_ended_arabic_df.copy()
126
+ open_ended_french_leaderboard_df = open_ended_french_df.copy()
127
+ open_ended_portuguese_leaderboard_df = open_ended_portuguese_df.copy()
128
+ open_ended_romanian_leaderboard_df = open_ended_romanian_df.copy()
129
+ open_ended_greek_leaderboard_df = open_ended_greek_df.copy()
130
+ open_ended_spanish_leaderboard_df = open_ended_spanish_df.copy()
131
+ closed_ended_multilingual_leaderboard_df = closed_ended_multilingual_df.copy()
132
+
133
+
134
+ # if PRIVATE_REPO:
135
+ # _, closed_ended_arabic_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, CLOSED_ENDED_ARABIC_COLS, CLOSED_ENDED_ARABIC_BENCHMARK_COLS, "score", "closed_ended_arabic")
136
+ # closed_ended_arabic_leaderboard_df = closed_ended_arabic_original_df.copy()
137
 
138
  # breakpoint()
139
  # # Token based results
 
171
  elif subset == "soap":
172
  leaderboard_table_df = soap_leaderboard_df.copy()
173
  hidden_leader_board_df = soap_original_df
174
+ elif subset == "open_ended_arabic":
175
+ leaderboard_table_df = open_ended_arabic_df.copy()
176
+ hidden_leader_board_df = open_ended_arabic_df
177
+ elif subset == "open_ended_french":
178
+ leaderboard_table_df = open_ended_french_df.copy()
179
+ hidden_leader_board_df = open_ended_french_df
180
+ elif subset == "open_ended_portuguese":
181
+ leaderboard_table_df = open_ended_portuguese_df.copy()
182
+ hidden_leader_board_df = open_ended_portuguese_df
183
+ elif subset == "open_ended_romanian":
184
+ leaderboard_table_df = open_ended_romanian_df.copy()
185
+ hidden_leader_board_df = open_ended_romanian_df
186
+ elif subset == "open_ended_greek":
187
+ leaderboard_table_df = open_ended_greek_df.copy()
188
+ hidden_leader_board_df = open_ended_greek_df
189
+ elif subset == "open_ended_spanish":
190
+ leaderboard_table_df = open_ended_spanish_df.copy()
191
+ hidden_leader_board_df = open_ended_spanish_df
192
+ elif subset == "closed_ended_multilingual":
193
+ leaderboard_table_df = closed_ended_multilingual_df.copy()
194
+ hidden_leader_board_df = closed_ended_multilingual_df
195
+
196
  # else:
197
  # match evaluation_metric:
198
  # case "Span Based":
 
312
  demo = gr.Blocks(css=custom_css)
313
  with demo:
314
  print("hello")
315
  gr.HTML(LOGO)
316
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
317
+
318
+
319
+
320
 
321
+
322
+ with gr.Blocks() as demo:
323
+ with gr.Tabs(elem_classes="tab-buttons") as outer_tabs:
324
+ with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=11):
325
+ with gr.Tabs(elem_classes="tab-buttons6") as language_tabs:
326
+ LANGUAGES = {
327
+ "🇺🇸 English": "open_ended",
328
+ "🇦🇪 Arabic": "open_ended_arabic",
329
+ "🇫🇷 French": "open_ended_french",
330
+ "🇪🇸 Spanish": "open_ended_spanish",
331
+ "🇵🇹 Portuguese": "open_ended_portuguese",
332
+ "🇷🇴 Romanian": "open_ended_romanian",
333
+ "🇬🇷 Greek": "open_ended_greek",
334
+ }
335
+
336
+ for idx, (label, subset) in enumerate(LANGUAGES.items()):
337
+ with gr.TabItem(label, elem_id=f"llm-benchmark-tab-open-{subset}", id=idx):
338
+ # Custom judge information for each language
339
+ if label == "🇺🇸 English":
340
+ judge_text = "**Note:** Llama 3.1 70B Instruct has been used as judge for English."
341
+ else:
342
+ judge_text = "**Note:** Qwen 2.5 72B Instruct has been used as judge for this language."
343
+
344
+ gr.Markdown(judge_text, elem_classes="markdown-text")
345
+
346
+ with gr.Row():
347
+ with gr.Column():
348
+ with gr.Row():
349
+ search_bar = gr.Textbox(
350
+ placeholder=f"🔍 Search for your model in {label}...",
351
+ show_label=False,
352
+ elem_id=f"search-bar-{subset}",
353
+ )
354
+ with gr.Row():
355
+ shown_columns = gr.CheckboxGroup(
356
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
357
+ value=[
358
+ c.name
359
+ for c in fields(AutoEvalColumn)
360
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)
361
+ ],
362
+ label="Select columns to show",
363
+ elem_id=f"column-select-{subset}",
364
+ interactive=True,
365
+ )
366
+ with gr.Column(min_width=320):
367
+ filter_columns_type = gr.CheckboxGroup(
368
+ label="Model Types",
369
+ choices=[t.to_str() for t in ModelType],
370
+ value=[t.to_str() for t in ModelType],
371
+ interactive=True,
372
+ elem_id=f"filter-columns-type-{subset}",
373
+ )
374
+ filter_domain_specific = gr.CheckboxGroup(
375
+ label="Domain Specificity",
376
+ choices=["🏥 Clinical models", "Generic models"],
377
+ value=["🏥 Clinical models", "Generic models"],
378
+ interactive=True,
379
+ elem_id=f"filter-columns-domain-{subset}",
380
+ )
381
+ filter_columns_size = gr.CheckboxGroup(
382
+ label="Model sizes (in billions of parameters)",
383
+ choices=list(NUMERIC_INTERVALS.keys()),
384
+ value=list(NUMERIC_INTERVALS.keys()),
385
+ interactive=True,
386
+ elem_id=f"filter-columns-size-{subset}",
387
+ )
388
+
389
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset=subset)
390
+
391
+ leaderboard_table = gr.Dataframe(
392
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
393
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
394
+ datatype=TYPES,
395
+ elem_id=f"leaderboard-table-{subset}",
396
+ interactive=False,
397
+ visible=True,
398
  )
399
+
400
+ hidden_leaderboard_table_for_search = gr.Dataframe(
401
+ value=datasets_original_df[OPEN_ENDED_COLS],
402
+ headers=OPEN_ENDED_COLS,
403
+ datatype=TYPES,
404
+ visible=False,
405
+ )
406
+
407
+ search_bar.submit(
408
+ update_table,
409
+ [
410
+ hidden_leaderboard_table_for_search,
411
+ shown_columns,
412
+ search_bar,
413
+ filter_columns_type,
414
+ filter_domain_specific,
415
+ filter_columns_size
416
  ],
417
+ leaderboard_table,
418
  )
419
 
420
+ for selector in [
421
+ shown_columns,
422
+ filter_columns_type,
423
+ filter_domain_specific,
424
+ filter_columns_size,
425
+ ]:
426
+ selector.change(
427
+ update_table,
428
+ [
429
+ hidden_leaderboard_table_for_search,
430
+ shown_columns,
431
+ search_bar,
432
+ filter_columns_type,
433
+ filter_domain_specific,
434
+ filter_columns_size
435
+ ],
436
+ leaderboard_table,
437
+ queue=True,
438
+ )
439
+
440
+ with gr.Accordion("💬 Generation templates", open=False):
441
+ with gr.Accordion("Response generation", open=False):
442
+ render_generation_templates(task="open_ended", generation_type="response_generation")
443
+ with gr.Accordion("Scoring Rubric", open=False):
444
+ render_generation_templates(task="open_ended", generation_type="scoring_rubric")
445
+
446
 
447
448
449
  with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=2):
450
  with gr.Row():
451
  with gr.Column():
 
453
  search_bar = gr.Textbox(
454
  placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
455
  show_label=False,
456
+ elem_id="search-bar-med-safety",
457
  )
458
  with gr.Row():
459
  shown_columns = gr.CheckboxGroup(
 
464
  if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)
465
  ],
466
  label="Select columns to show",
467
+ elem_id="column-select-med-safety",
468
  interactive=True,
469
  )
470
  with gr.Column(min_width=320):
 
471
  filter_columns_type = gr.CheckboxGroup(
472
  label="Model Types",
473
  choices=[t.to_str() for t in ModelType],
474
  value=[t.to_str() for t in ModelType],
475
  interactive=True,
476
+ elem_id="filter-columns-type-med-safety",
477
  )
478
  filter_domain_specific = gr.CheckboxGroup(
479
  label="Domain Specificity",
480
  choices=["🏥 Clinical models", "Generic models"],
481
  value=["🏥 Clinical models", "Generic models"],
482
  interactive=True,
483
+ elem_id="filter-domain-specific-med-safety",
484
  )
485
  filter_columns_size = gr.CheckboxGroup(
486
  label="Model sizes (in billions of parameters)",
487
  choices=list(NUMERIC_INTERVALS.keys()),
488
  value=list(NUMERIC_INTERVALS.keys()),
489
  interactive=True,
490
+ elem_id="filter-columns-size-med-safety",
491
  )
492
 
493
  datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="med_safety")
494
 
495
+ leaderboard_table = gr.Dataframe(
496
  value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
497
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
498
  datatype=TYPES,
499
+ elem_id="leaderboard-table-med-safety",
500
  interactive=False,
501
  visible=True,
502
  )
503
 
504
+ hidden_leaderboard_table_for_search = gr.Dataframe(
 
505
  value=datasets_original_df[MED_SAFETY_COLS],
506
  headers=MED_SAFETY_COLS,
507
  datatype=TYPES,
508
  visible=False,
509
  )
510
+
 
511
  search_bar.submit(
512
  update_table,
513
  [
 
517
  filter_columns_type,
518
  filter_domain_specific,
519
  filter_columns_size
 
520
  ],
521
  leaderboard_table,
522
  )
523
+
524
  for selector in [
525
  shown_columns,
526
  filter_columns_type,
527
  filter_domain_specific,
528
  filter_columns_size,
 
529
  ]:
530
  selector.change(
531
  update_table,
 
540
  leaderboard_table,
541
  queue=True,
542
  )
543
+
544
  with gr.Accordion("💬 Generation templates", open=False):
545
  with gr.Accordion("Response generation", open=False):
546
  system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="response_generation")
547
  with gr.Accordion("Scoring Rubric", open=False):
548
  system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="scoring_rubric")
549
+
550
  with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=3):
551
  gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
552
  with gr.Row():
 
555
  search_bar = gr.Textbox(
556
  placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
557
  show_label=False,
558
+ elem_id="search-bar-med-summarization",
559
  )
560
  with gr.Row():
561
  shown_columns = gr.CheckboxGroup(
 
566
  if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
567
  ],
568
  label="Select columns to show",
569
+ elem_id="column-select-med-summarization",
570
  interactive=True,
571
  )
572
  with gr.Column(min_width=320):
 
573
  filter_columns_type = gr.CheckboxGroup(
574
  label="Model Types",
575
  choices=[t.to_str() for t in ModelType],
576
  value=[t.to_str() for t in ModelType],
577
  interactive=True,
578
+ elem_id="filter-columns-type-med-summarization",
579
  )
580
  filter_domain_specific = gr.CheckboxGroup(
581
  label="Domain Specificity",
582
  choices=["🏥 Clinical models", "Generic models"],
583
  value=["🏥 Clinical models", "Generic models"],
584
  interactive=True,
585
+ elem_id="filter-domain-specific-med-summarization",
586
  )
587
  filter_columns_size = gr.CheckboxGroup(
588
  label="Model sizes (in billions of parameters)",
589
  choices=list(NUMERIC_INTERVALS.keys()),
590
  value=list(NUMERIC_INTERVALS.keys()),
591
  interactive=True,
592
+ elem_id="filter-columns-size-med-summarization",
593
  )
594
 
595
  datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="medical_summarization")
596
 
597
+ leaderboard_table = gr.Dataframe(
598
  value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
599
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
600
  datatype=TYPES,
601
+ elem_id="leaderboard-table-med-summarization",
602
  interactive=False,
603
  visible=True,
604
  )
605
 
606
+ hidden_leaderboard_table_for_search = gr.Dataframe(
 
607
  value=datasets_original_df[MEDICAL_SUMMARIZATION_COLS],
608
  headers=MEDICAL_SUMMARIZATION_COLS,
609
  datatype=TYPES,
610
  visible=False,
611
  )
612
+
 
613
  search_bar.submit(
614
  update_table,
615
  [
 
619
  filter_columns_type,
620
  filter_domain_specific,
621
  filter_columns_size
 
622
  ],
623
  leaderboard_table,
624
  )
625
+
626
  for selector in [
627
  shown_columns,
628
  filter_columns_type,
629
  filter_domain_specific,
630
  filter_columns_size,
 
631
  ]:
632
  selector.change(
633
  update_table,
 
642
  leaderboard_table,
643
  queue=True,
644
  )
645
+
646
  with gr.Accordion("💬 Generation templates", open=False):
647
  with gr.Accordion("Response generation", open=False):
648
  system_prompt, user_prompt = render_generation_templates(task="medical_summarization", generation_type="response_generation")
649
  with gr.Accordion("Question generation", open=False):
650
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
651
  with gr.Accordion("Cross Examination", open=False):
652
+ system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
653
+
654
  with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=4):
655
  gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
656
+ with gr.Tabs(elem_classes="tab-buttons2") as note_tabs:
657
+ with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-aci", id=0):
658
  with gr.Row():
659
  with gr.Column():
660
  with gr.Row():
661
  search_bar = gr.Textbox(
662
  placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
663
  show_label=False,
664
+ elem_id="search-bar-aci",
665
  )
666
  with gr.Row():
667
  shown_columns = gr.CheckboxGroup(
 
672
  if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)
673
  ],
674
  label="Select columns to show",
675
+ elem_id="column-select-aci",
676
  interactive=True,
677
  )
678
  with gr.Column(min_width=320):
 
679
  filter_columns_type = gr.CheckboxGroup(
680
  label="Model Types",
681
  choices=[t.to_str() for t in ModelType],
682
  value=[t.to_str() for t in ModelType],
683
  interactive=True,
684
+ elem_id="filter-columns-type-aci",
685
  )
686
  filter_domain_specific = gr.CheckboxGroup(
687
  label="Domain Specificity",
688
  choices=["🏥 Clinical models", "Generic models"],
689
  value=["🏥 Clinical models", "Generic models"],
690
  interactive=True,
691
+ elem_id="filter-domain-specific-aci",
692
  )
693
  filter_columns_size = gr.CheckboxGroup(
694
  label="Model sizes (in billions of parameters)",
695
  choices=list(NUMERIC_INTERVALS.keys()),
696
  value=list(NUMERIC_INTERVALS.keys()),
697
  interactive=True,
698
+ elem_id="filter-columns-size-aci",
699
  )
700
 
701
  datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="aci")
702
 
703
+ leaderboard_table = gr.Dataframe(
704
  value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
705
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
706
  datatype=TYPES,
707
+ elem_id="leaderboard-table-aci",
708
  interactive=False,
709
  visible=True,
710
  )
711
 
712
+ hidden_leaderboard_table_for_search = gr.Dataframe(
 
713
  value=datasets_original_df[ACI_COLS],
714
  headers=ACI_COLS,
715
  datatype=TYPES,
716
  visible=False,
717
  )
718
+
 
719
  search_bar.submit(
720
  update_table,
721
  [
 
725
  filter_columns_type,
726
  filter_domain_specific,
727
  filter_columns_size
 
728
  ],
729
  leaderboard_table,
730
  )
731
+
732
  for selector in [
733
  shown_columns,
734
  filter_columns_type,
735
  filter_domain_specific,
736
  filter_columns_size,
 
737
  ]:
738
  selector.change(
739
  update_table,
 
748
  leaderboard_table,
749
  queue=True,
750
  )
751
+
752
+ with gr.TabItem("SOAP Notes", elem_id="llm-benchmark-tab-soap", id=1):
753
  with gr.Row():
754
  with gr.Column():
755
  with gr.Row():
756
  search_bar = gr.Textbox(
757
  placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
758
  show_label=False,
759
+ elem_id="search-bar-soap",
760
  )
761
  with gr.Row():
762
  shown_columns = gr.CheckboxGroup(
 
767
  if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)
768
  ],
769
  label="Select columns to show",
770
+ elem_id="column-select-soap",
771
  interactive=True,
772
  )
773
  with gr.Column(min_width=320):
 
774
  filter_columns_type = gr.CheckboxGroup(
775
  label="Model Types",
776
  choices=[t.to_str() for t in ModelType],
777
  value=[t.to_str() for t in ModelType],
778
  interactive=True,
779
+ elem_id="filter-columns-type-soap",
780
  )
781
  filter_domain_specific = gr.CheckboxGroup(
782
  label="Domain Specificity",
783
  choices=["🏥 Clinical models", "Generic models"],
784
  value=["🏥 Clinical models", "Generic models"],
785
  interactive=True,
786
+ elem_id="filter-domain-specific-soap",
787
  )
788
  filter_columns_size = gr.CheckboxGroup(
789
  label="Model sizes (in billions of parameters)",
790
  choices=list(NUMERIC_INTERVALS.keys()),
791
  value=list(NUMERIC_INTERVALS.keys()),
792
  interactive=True,
793
+ elem_id="filter-columns-size-soap",
794
  )
795
 
796
  datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="soap")
797
 
798
+ leaderboard_table = gr.Dataframe(
799
  value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
800
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
801
  datatype=TYPES,
802
+ elem_id="leaderboard-table-soap",
803
  interactive=False,
804
  visible=True,
805
  )
806
 
807
+ hidden_leaderboard_table_for_search = gr.Dataframe(
 
808
  value=datasets_original_df[SOAP_COLS],
809
  headers=SOAP_COLS,
810
  datatype=TYPES,
811
  visible=False,
812
  )
813
+
 
814
  search_bar.submit(
815
  update_table,
816
  [
 
820
  filter_columns_type,
821
  filter_domain_specific,
822
  filter_columns_size
 
823
  ],
824
  leaderboard_table,
825
  )
826
+
827
  for selector in [
828
  shown_columns,
829
  filter_columns_type,
830
  filter_domain_specific,
831
  filter_columns_size,
 
832
  ]:
833
  selector.change(
834
  update_table,
 
843
  leaderboard_table,
844
  queue=True,
845
  )
846
+
847
  with gr.Accordion("💬 Generation templates", open=False):
848
  with gr.Accordion("ACI-Bench Response generation", open=False):
849
  system_prompt, user_prompt = render_generation_templates(task="aci", generation_type="response_generation")
 
852
  with gr.Accordion("Question generation", open=False):
853
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
854
  with gr.Accordion("Cross Examination", open=False):
855
+ system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
856
+
857
+ with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-closed", id=6):
858
+ with gr.Tabs(elem_classes="tab-buttons2") as closed_tabs:
859
+ # ENGLISH TAB
860
+ with gr.TabItem("English", elem_id="llm-benchmark-tab-closed-english", id=0):
861
+ with gr.Row():
862
+ with gr.Column():
863
+ with gr.Row():
864
+ search_bar = gr.Textbox(
865
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
866
+ show_label=False,
867
+ elem_id="search-bar-closed-english",
868
  )
869
+ with gr.Row():
870
+ shown_columns = gr.CheckboxGroup(
871
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
872
+ value=[
873
+ c.name
874
+ for c in fields(AutoEvalColumn)
875
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
876
+ ],
877
+ label="Select columns to show",
878
+ elem_id="column-select-closed-english",
879
  interactive=True,
 
880
  )
881
+ with gr.Column(min_width=320):
882
+ filter_columns_type = gr.CheckboxGroup(
883
+ label="Model Types",
884
+ choices=[t.to_str() for t in ModelType],
885
+ value=[t.to_str() for t in ModelType],
886
+ interactive=True,
887
+ elem_id="filter-columns-type-closed-english",
888
+ )
889
+ filter_domain_specific = gr.CheckboxGroup(
890
+ label="Domain Specificity",
891
+ choices=["🏥 Clinical models", "Generic models"],
892
+ value=["🏥 Clinical models", "Generic models"],
893
+ interactive=True,
894
+ elem_id="filter-domain-specific-closed-english",
895
+ )
896
+ filter_columns_size = gr.CheckboxGroup(
897
+ label="Model sizes (in billions of parameters)",
898
+ choices=list(NUMERIC_INTERVALS.keys()),
899
+ value=list(NUMERIC_INTERVALS.keys()),
900
+ interactive=True,
901
+ elem_id="filter-columns-size-closed-english",
902
+ )
903
 
904
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
905
+ leaderboard_table = gr.components.Dataframe(
906
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
907
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
908
+ datatype=TYPES,
909
+ elem_id="leaderboard-table-english",
910
+ interactive=False,
911
+ visible=True,
912
+ )
913
+
914
+ # Dummy leaderboard for handling the case when the user uses backspace key
915
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
916
+ value=datasets_original_df[DATASET_COLS],
917
+ headers=DATASET_COLS,
918
+ datatype=TYPES,
919
+ visible=False,
920
+ )
921
+
922
+ search_bar.submit(
923
+ update_table,
924
+ [
925
+ hidden_leaderboard_table_for_search,
926
+ shown_columns,
927
+ search_bar,
928
+ filter_columns_type,
929
+ filter_domain_specific,
930
+ filter_columns_size
931
+ ],
932
+ leaderboard_table,
933
+ )
934
+
935
+ for selector in [
936
+ shown_columns,
937
+ filter_columns_type,
938
+ filter_domain_specific,
939
+ filter_columns_size,
940
+ ]:
941
+ selector.change(
942
  update_table,
943
  [
944
  hidden_leaderboard_table_for_search,
 
947
  filter_columns_type,
948
  filter_domain_specific,
949
  filter_columns_size
 
950
  ],
951
  leaderboard_table,
952
+ queue=True,
953
  )
954
+
955
+ # MULTILINGUAL TAB - same level as the English tab
956
+ with gr.TabItem("🌍 Multilingual", elem_id="llm-benchmark-tab-table9", id=1):
957
  with gr.Row():
958
+ gr.Markdown("📊 **Dataset Information:** This tab uses the Global MMLU dataset, filtered to the medical subcategory only (10.7% of the dataset).")
959
+
 
 
 
960
  with gr.Row():
961
+ with gr.Column():
962
+ with gr.Row():
963
+ search_bar = gr.Textbox(
964
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
965
+ show_label=False,
966
+ elem_id="search-bar",
967
+ )
968
+
969
+ with gr.Row():
970
+ shown_columns = gr.CheckboxGroup(
971
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)],
972
+ value=[
973
+ c.name
974
+ for c in fields(AutoEvalColumn)
975
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)
976
+ ],
977
+ label="Select columns to show",
978
+ elem_id="column-select",
979
+ interactive=True,
980
+ )
981
+ with gr.Column(min_width=320):
982
+ # with gr.Box(elem_id="box-filter"):
983
+ filter_columns_type = gr.CheckboxGroup(
984
+ label="Model Types",
985
+ choices=[t.to_str() for t in ModelType],
986
+ value=[t.to_str() for t in ModelType],
987
+ interactive=True,
988
+ elem_id="filter-columns-type",
989
  )
990
+ filter_domain_specific = gr.CheckboxGroup(
991
+ label="Domain Specificity",
992
+ choices=["🏥 Clinical models", "Generic models"],
993
+ value=["🏥 Clinical models", "Generic models"],
994
+ interactive=True,
995
+ elem_id="filter-columns-type",
996
  )
997
+ filter_columns_size = gr.CheckboxGroup(
998
+ label="Model sizes (in billions of parameters)",
999
+ choices=list(NUMERIC_INTERVALS.keys()),
1000
+ value=list(NUMERIC_INTERVALS.keys()),
1001
+ interactive=True,
1002
+ elem_id="filter-columns-size",
1003
  )
 
 
1004
 
1005
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="closed_ended_multilingual")
1006
+ leaderboard_table = gr.components.Dataframe(
1007
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1008
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1009
+ datatype=TYPES,
1010
+ elem_id="leaderboard-table",
1011
+ interactive=False,
1012
+ visible=True,
 
 
1013
  )
1014
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
1015
+ value=datasets_original_df[ClosedEndedMultilingual_COLS],
1016
+ headers=ClosedEndedMultilingual_COLS,
1017
+ datatype=TYPES,
1018
+ visible=False,
1019
  )
1020
+
1021
+ search_bar.submit(
1022
+ update_table,
1023
+ [
1024
+ hidden_leaderboard_table_for_search,
1025
+ shown_columns,
1026
+ search_bar,
1027
+ filter_columns_type,
1028
+ filter_domain_specific,
1029
+ filter_columns_size
1030
+ # filter_columns_architecture
1031
+ ],
1032
+ leaderboard_table,
1033
  )
1034
+ for selector in [
1035
+ shown_columns,
1036
+ filter_columns_type,
1037
+ filter_domain_specific,
1038
+ # filter_columns_architecture,
1039
+ filter_columns_size,
1040
+ # deleted_models_visibility,
1041
+ ]:
1042
+ selector.change(
1043
+ update_table,
1044
+ [
1045
+ hidden_leaderboard_table_for_search,
1046
+ shown_columns,
1047
+ search_bar,
1048
+ filter_columns_type,
1049
+ filter_domain_specific,
1050
+ filter_columns_size
1051
+ # filter_columns_architecture,
1052
+ ],
1053
+ leaderboard_table,
1054
+ queue=True,
1055
+ )
1056
+ with gr.Row():
1057
+ with gr.Accordion("📙 Citation", open=False):
1058
  citation_button = gr.Textbox(
1059
  value=CITATION_BUTTON_TEXT,
1060
  label=CITATION_BUTTON_LABEL,
 
1066
  scheduler = BackgroundScheduler()
1067
  scheduler.add_job(restart_space, "interval", seconds=1800)
1068
  scheduler.start()
1069
+ demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'])
app_original.py ADDED
@@ -0,0 +1,1276 @@
1
+ import subprocess
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ from apscheduler.schedulers.background import BackgroundScheduler
6
+ from huggingface_hub import snapshot_download
7
+
8
+ from src.about import (
9
+ CITATION_BUTTON_LABEL,
10
+ CITATION_BUTTON_TEXT,
11
+ EVALUATION_QUEUE_TEXT,
12
+ INTRODUCTION_TEXT,
13
+ LLM_BENCHMARKS_TEXT_1,
14
+ LLM_BENCHMARKS_TEXT_2,
15
+ CROSS_EVALUATION_METRICS,
16
+ NOTE_GENERATION_METRICS,
17
+ # EVALUATION_EXAMPLE_IMG,
18
+ # LLM_BENCHMARKS_TEXT_2,
19
+ # ENTITY_DISTRIBUTION_IMG,
20
+ # LLM_BENCHMARKS_TEXT_3,
21
+ TITLE,
22
+ LOGO,
23
+ FIVE_PILLAR_DIAGRAM
24
+ )
25
+ from src.display.css_html_js import custom_css
26
+ # changes to be made here
27
+ from src.display.utils import (
28
+ DATASET_BENCHMARK_COLS,
29
+ OPEN_ENDED_BENCHMARK_COLS,
30
+ MED_SAFETY_BENCHMARK_COLS,
31
+ MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
32
+ ACI_BENCHMARK_COLS,
33
+ SOAP_BENCHMARK_COLS,
34
+ #CLOSED_ENDED_ARABIC_BENCHMARK_COLS,
35
+ DATASET_COLS,
36
+ OPEN_ENDED_COLS,
37
+ MED_SAFETY_COLS,
38
+ MEDICAL_SUMMARIZATION_COLS,
39
+ ACI_COLS,
40
+ SOAP_COLS,
41
+ #CLOSED_ENDED_ARABIC_COLS,
42
+ EVAL_COLS,
43
+ EVAL_TYPES,
44
+ NUMERIC_INTERVALS,
45
+ TYPES,
46
+ AutoEvalColumn,
47
+ ModelType,
48
+ ModelArch,
49
+ PromptTemplateName,
50
+ Precision,
51
+ WeightType,
52
+ fields,
53
+ render_generation_templates,
54
+ OpenEndedArabic_COLS,
55
+ OpenEndedArabic_BENCHMARK_COLS,
56
+ OpenEndedFrench_COLS,
57
+ OpenEndedFrench_BENCHMARK_COLS,
58
+ OpenEndedPortuguese_COLS,
59
+ OpenEndedPortuguese_BENCHMARK_COLS,
60
+ OpenEndedRomanian_COLS,
61
+ OpenEndedRomanian_BENCHMARK_COLS,
62
+ OpenEndedGreek_COLS,
63
+ OpenEndedGreek_BENCHMARK_COLS,
64
+ OpenEndedSpanish_COLS,
65
+ OpenEndedSpanish_BENCHMARK_COLS,
66
+ ClosedEndedMultilingual_COLS,
67
+ ClosedEndedMultilingual_BENCHMARK_COLS,
68
+
69
+ #closed_ended_multilingual,
70
+ # Open_EndedArabic,
71
+ # Open_EndedSpanish,
72
+ # Open_EndedFrench,
73
+ # Open_EndedPortuguese,
74
+ # Open_EndedRomanian,
75
+ # Open_EndedGreek,
76
+ # Open_EndedSpanish,
77
+ # Open_EndedArabic,
78
+ # Open_EndedFrench,
79
+
80
+ )
81
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, PRIVATE_REPO
82
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
83
+ from src.submission.submit import add_new_eval, PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG
84
+
85
+ def restart_space():
86
+ API.restart_space(repo_id=REPO_ID)
87
+
88
+
89
+ try:
90
+ print(EVAL_REQUESTS_PATH)
91
+ snapshot_download(
92
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
93
+ )
94
+ except Exception:
95
+ restart_space()
96
+ try:
97
+ print(EVAL_RESULTS_PATH)
98
+ snapshot_download(
99
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
100
+ )
101
+ except Exception:
102
+ restart_space()
103
+
104
+ # Span based results
105
+ # changes to be made here
106
+
107
+ _, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
108
+ harness_datasets_leaderboard_df = harness_datasets_original_df.copy()
109
+
110
+ _, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
111
+ open_ended_leaderboard_df = open_ended_original_df.copy()
112
+
113
+ _, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
114
+ med_safety_leaderboard_df = med_safety_original_df.copy()
115
+
116
+ _, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
117
+ medical_summarization_leaderboard_df = medical_summarization_original_df.copy()
118
+
119
+ _, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
120
+ aci_leaderboard_df = aci_original_df.copy()
121
+
122
+ _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
123
+ soap_leaderboard_df = soap_original_df.copy()
124
+
125
+
126
+ _, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
127
+ _, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
128
+ _, open_ended_portuguese_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS, "score", "open_ended_portuguese")
129
+ _, open_ended_romanian_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, "score", "open_ended_romanian")
130
+ _, open_ended_greek_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedGreek_COLS, OpenEndedGreek_BENCHMARK_COLS, "score", "open_ended_greek")
131
+ _, open_ended_spanish_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS, "score", "open_ended_spanish")
132
+ _, closed_ended_multilingual_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS, "score", "closed_ended_multilingual")
133
+
134
+
135
+ open_ended_arabic_leaderboard_df = open_ended_arabic_df.copy()
136
+ open_ended_french_leaderboard_df = open_ended_french_df.copy()
137
+ open_ended_portuguese_leaderboard_df = open_ended_portuguese_df.copy()
138
+ open_ended_romanian_leaderboard_df = open_ended_romanian_df.copy()
139
+ open_ended_greek_leaderboard_df = open_ended_greek_df.copy()
140
+ open_ended_spanish_leaderboard_df = open_ended_spanish_df.copy()
141
+ closed_ended_multilingual_leaderboard_df = closed_ended_multilingual_df.copy()
142
+
143
+
144
+ # if PRIVATE_REPO:
145
+ # _, closed_ended_arabic_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, CLOSED_ENDED_ARABIC_COLS, CLOSED_ENDED_ARABIC_BENCHMARK_COLS, "score", "closed_ended_arabic")
146
+ # closed_ended_arabic_leaderboard_df = closed_ended_arabic_original_df.copy()
147
+
148
+ # breakpoint()
149
+ # # Token based results
150
+ # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
151
+ # token_based_datasets_leaderboard_df = token_based_datasets_original_df.copy()
152
+
153
+ # _, token_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "clinical_types")
154
+ # token_based_types_leaderboard_df = token_based_types_original_df.copy()
155
+
156
+
157
+ (
158
+ finished_eval_queue_df,
159
+ running_eval_queue_df,
160
+ pending_eval_queue_df,
161
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
162
+
163
+ # breakpoint()
164
+ def update_df(shown_columns, subset="datasets"):
165
+ # changes to be made here
166
+ if subset == "datasets":
167
+ leaderboard_table_df = harness_datasets_leaderboard_df.copy()
168
+ hidden_leader_board_df = harness_datasets_original_df
169
+ elif subset == "open_ended":
170
+ leaderboard_table_df = open_ended_leaderboard_df.copy()
171
+ hidden_leader_board_df = open_ended_original_df
172
+ elif subset == "med_safety":
173
+ leaderboard_table_df = med_safety_leaderboard_df.copy()
174
+ hidden_leader_board_df = med_safety_original_df
175
+ elif subset == "medical_summarization":
176
+ leaderboard_table_df = medical_summarization_leaderboard_df.copy()
177
+ hidden_leader_board_df = medical_summarization_original_df
178
+ elif subset == "aci":
179
+ leaderboard_table_df = aci_leaderboard_df.copy()
180
+ hidden_leader_board_df = aci_original_df
181
+ elif subset == "soap":
182
+ leaderboard_table_df = soap_leaderboard_df.copy()
183
+ hidden_leader_board_df = soap_original_df
184
+ elif subset == "open_ended_arabic":
185
+ leaderboard_table_df = open_ended_arabic_df.copy()
186
+ hidden_leader_board_df = open_ended_arabic_df
187
+ elif subset == "open_ended_french":
188
+ leaderboard_table_df = open_ended_french_df.copy()
189
+ hidden_leader_board_df = open_ended_french_df
190
+ elif subset == "open_ended_portuguese":
191
+ leaderboard_table_df = open_ended_portuguese_df.copy()
192
+ hidden_leader_board_df = open_ended_portuguese_df
193
+ elif subset == "open_ended_romanian":
194
+ leaderboard_table_df = open_ended_romanian_df.copy()
195
+ hidden_leader_board_df = open_ended_romanian_df
196
+ elif subset == "open_ended_greek":
197
+ leaderboard_table_df = open_ended_greek_df.copy()
198
+ hidden_leader_board_df = open_ended_greek_df
199
+ elif subset == "open_ended_spanish":
200
+ leaderboard_table_df = open_ended_spanish_df.copy()
201
+ hidden_leader_board_df = open_ended_spanish_df
202
+ elif subset == "closed_ended_multilingual":
203
+ leaderboard_table_df = closed_ended_multilingual_df.copy()
204
+ hidden_leader_board_df = closed_ended_multilingual_df
205
+
206
+ # else:
207
+ # match evaluation_metric:
208
+ # case "Span Based":
209
+ # leaderboard_table_df = span_based_types_leaderboard_df.copy()
210
+ # hidden_leader_board_df = span_based_types_original_df
211
+ # case "Token Based":
212
+ # leaderboard_table_df = token_based_types_leaderboard_df.copy()
213
+ # hidden_leader_board_df = token_based_types_original_df
214
+ # case _:
215
+ # pass
216
+
217
+
218
+ value_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns
219
+ # breakpoint()
220
+ return leaderboard_table_df[value_cols], hidden_leader_board_df
221
+
222
+
223
+ # Searching and filtering
224
+ def update_table(
225
+ hidden_df: pd.DataFrame,
226
+ columns: list,
227
+ query: str = "",
228
+ type_query: list = None,
229
+ domain_specific_query: list = None,
230
+ size_query: list = None,
231
+ precision_query: str = None,
232
+ show_deleted: bool = False,
233
+ ):
234
+ # breakpoint()
235
+ filtered_df = filter_models(hidden_df, type_query, domain_specific_query, size_query, precision_query, show_deleted)
236
+ # breakpoint()
237
+ filtered_df = filter_queries(query, filtered_df)
238
+ # breakpoint()
239
+ df = select_columns(filtered_df, columns, list(hidden_df.columns))
240
+ # breakpoint()
241
+ return df
242
+
243
+
244
+ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
245
+ return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
246
+
247
+
248
+ def select_columns(df: pd.DataFrame, columns: list, cols:list) -> pd.DataFrame:
249
+ always_here_cols = [
250
+ AutoEvalColumn.model_type_symbol.name,
251
+ AutoEvalColumn.model.name,
252
+ ]
253
+ # We use COLS to maintain sorting
254
+ filtered_df = df[always_here_cols + [c for c in cols if c in df.columns and c in columns]]
255
+ return filtered_df
256
+
257
+
258
+ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
259
+ final_df = []
260
+ if query != "":
261
+ queries = [q.strip() for q in query.split(";")]
262
+ for _q in queries:
263
+ _q = _q.strip()
264
+ if _q != "":
265
+ temp_filtered_df = search_table(filtered_df, _q)
266
+ if len(temp_filtered_df) > 0:
267
+ final_df.append(temp_filtered_df)
268
+ if len(final_df) > 0:
269
+ filtered_df = pd.concat(final_df)
270
+ filtered_df = filtered_df.drop_duplicates(
271
+ subset=[
272
+ AutoEvalColumn.model.name,
273
+ # AutoEvalColumn.precision.name,
274
+ # AutoEvalColumn.revision.name,
275
+ ]
276
+ )
277
+
278
+ return filtered_df
279
+
280
+
281
+ def filter_models(
282
+ df: pd.DataFrame, type_query: list, domain_specific_query: list, size_query: list, precision_query: list, show_deleted: bool
283
+ ) -> pd.DataFrame:
284
+ # Show all models
285
+ # if show_deleted:
286
+ # filtered_df = df
287
+ # else: # Show only still on the hub models
288
+ # filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
289
+
290
+ filtered_df = df
291
+
292
+ if type_query is not None:
293
+ type_name = [t.split(" ")[1] for t in type_query]
294
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type.name].isin(type_name)]
295
+
296
+ if domain_specific_query is not None:
297
+ domain_specifics = []
298
+ if "🏥 Clinical models" in domain_specific_query:
299
+ domain_specifics.append(True)
300
+ if "Generic models" in domain_specific_query:
301
+ domain_specifics.append(False)
302
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.is_domain_specific.name].isin(domain_specifics)]
303
+
304
+ # if architecture_query is not None:
305
+ # arch_types = [t for t in architecture_query]
306
+ # filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(arch_types)]
307
+ # # filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(architecture_query + ["None"])]
308
+
309
+ if precision_query is not None:
310
+ if AutoEvalColumn.precision.name in df.columns:
311
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
312
+
313
+ if size_query is not None:
314
+ numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
315
+ params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
316
+ mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
317
+ filtered_df = filtered_df.loc[mask]
318
+
319
+ return filtered_df
320
+
321
+
322
+ demo = gr.Blocks(css=custom_css)
323
+ with demo:
324
+ print("hello")
325
+ gr.HTML(LOGO)
326
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
327
+
328
+
329
+
330
+
331
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
332
+ with gr.TabItem("🌍 Open Ended Multilingual Evaluation", elem_id="llm-benchmark-tab-table", id=11):
333
+ with gr.Tabs(elem_classes="tab-buttons6") as tabs:
334
+ with gr.TabItem("English", elem_id="llm-benchmark-tab-table10", id=0):
335
+ with gr.Row():
336
+ with gr.Column():
337
+ with gr.Row():
338
+ search_bar = gr.Textbox(
339
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
340
+ show_label=False,
341
+ elem_id="search-bar",
342
+ )
343
+ with gr.Row():
344
+ shown_columns = gr.CheckboxGroup(
345
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
346
+ value=[
347
+ c.name
348
+ for c in fields(AutoEvalColumn)
349
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)
350
+ ],
351
+ label="Select columns to show",
352
+ elem_id="column-select",
353
+ interactive=True,
354
+
355
+ )
356
+
357
+ with gr.Column(min_width=320):
358
+ filter_columns_type = gr.CheckboxGroup(
359
+ label="Model Types",
360
+ choices=[t.to_str() for t in ModelType],
361
+ value=[t.to_str() for t in ModelType],
362
+ interactive=True,
363
+ elem_id="filter-columns-type",
364
+ )
365
+ filter_domain_specific = gr.CheckboxGroup(
366
+ label="Domain Specificity",
367
+ choices=["🏥 Clinical models", "Generic models"],
368
+ value=["🏥 Clinical models", "Generic models"],
369
+ interactive=True,
370
+ elem_id="filter-columns-type",
371
+ )
372
+ filter_domain_specific = gr.CheckboxGroup(
373
+ label="Domain Specificity",
374
+ choices=["🏥 Clinical models", "Generic models"],
375
+ value=["🏥 Clinical models", "Generic models"],
376
+ interactive=True,
377
+ elem_id="filter-columns-type",
378
+ )
379
+ filter_columns_size = gr.CheckboxGroup(
380
+ label="Model sizes (in billions of parameters)",
381
+ choices=list(NUMERIC_INTERVALS.keys()),
382
+ value=list(NUMERIC_INTERVALS.keys()),
383
+ interactive=True,
384
+ elem_id="filter-columns-size",
385
+ )
386
+
387
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="open_ended")
388
+
389
+
390
+ leaderboard_table = gr.components.Dataframe(
391
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
392
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
393
+ datatype=TYPES,
394
+ elem_id="leaderboard-table",
395
+ interactive=False,
396
+ visible=True,
397
+ )
398
+
399
+ # Dummy leaderboard for handling the case when the user uses backspace key
400
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
401
+ value=datasets_original_df[OPEN_ENDED_COLS],
402
+ headers=OPEN_ENDED_COLS,
403
+ datatype=TYPES,
404
+ visible=False,
405
+ )
406
+
407
+
408
+ search_bar.submit(
409
+ update_table,
410
+ [
411
+ hidden_leaderboard_table_for_search,
412
+ shown_columns,
413
+ search_bar,
414
+ filter_columns_type,
415
+ filter_domain_specific,
416
+ filter_columns_size
417
+ # filter_columns_architecture
418
+ ],
419
+ leaderboard_table,
420
+ )
421
+ for selector in [
422
+ shown_columns,
423
+ filter_columns_type,
424
+ filter_domain_specific,
425
+ # filter_columns_architecture,
426
+ filter_columns_size,
427
+ # deleted_models_visibility,
428
+ ]:
429
+ selector.change(
430
+ update_table,
431
+ [
432
+ hidden_leaderboard_table_for_search,
433
+ shown_columns,
434
+ search_bar,
435
+ filter_columns_type,
436
+ filter_domain_specific,
437
+ filter_columns_size
438
+ # filter_columns_architecture,
439
+ ],
440
+ leaderboard_table,
441
+ queue=True,
442
+ )
443
+
444
+
445
+ with gr.Accordion("💬 Generation templates", open=False):
446
+ with gr.Accordion("Response generation", open=False):
447
+ system_prompt, user_prompt = render_generation_templates(task="open_ended", generation_type="response_generation")
448
+ with gr.Accordion("Scoring Rubric", open=False):
449
+ system_prompt, user_prompt = render_generation_templates(task="open_ended", generation_type="scoring_rubric")
450
+
451
+
452
+ with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
453
+ with gr.Row():
454
+ with gr.Column():
455
+ with gr.Row():
456
+ search_bar = gr.Textbox(
457
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
458
+ show_label=False,
459
+ elem_id="search-bar",
460
+ )
461
+ with gr.Row():
462
+ shown_columns = gr.CheckboxGroup(
463
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
464
+ value=[
465
+ c.name
466
+ for c in fields(AutoEvalColumn)
467
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)
468
+ ],
469
+ label="Select columns to show",
470
+ elem_id="column-select",
471
+ interactive=True,
472
+ )
473
+ # with gr.Row():
474
+ # deleted_models_visibility = gr.Checkbox(
475
+ # value=False, label="Show gated/private/deleted models", interactive=True
476
+ # )
477
+ with gr.Column(min_width=320):
478
+ # with gr.Box(elem_id="box-filter"):
479
+ filter_columns_type = gr.CheckboxGroup(
480
+ label="Model Types",
481
+ choices=[t.to_str() for t in ModelType],
482
+ value=[t.to_str() for t in ModelType],
483
+ interactive=True,
484
+ elem_id="filter-columns-type",
485
+ )
486
+ # filter_columns_architecture = gr.CheckboxGroup(
487
+ # label="Architecture Types",
488
+ # choices=[i.value.name for i in ModelArch],
489
+ # value=[i.value.name for i in ModelArch],
490
+ # interactive=True,
491
+ # elem_id="filter-columns-architecture",
492
+ # )
493
+ filter_domain_specific = gr.CheckboxGroup(
494
+ label="Domain Specificity",
495
+ choices=["🏥 Clinical models", "Generic models"],
496
+ value=["🏥 Clinical models", "Generic models"],
497
+ interactive=True,
498
+ elem_id="filter-columns-type",
499
+ )
500
+ filter_columns_size = gr.CheckboxGroup(
501
+ label="Model sizes (in billions of parameters)",
502
+ choices=list(NUMERIC_INTERVALS.keys()),
503
+ value=list(NUMERIC_INTERVALS.keys()),
504
+ interactive=True,
505
+ elem_id="filter-columns-size",
506
+ )
507
+
508
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="open_ended")
509
+
510
+ leaderboard_table = gr.components.Dataframe(
511
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
512
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
513
+ datatype=TYPES,
514
+ elem_id="leaderboard-table",
515
+ interactive=False,
516
+ visible=True,
517
+ )
518
+
519
+ # Dummy leaderboard for handling the case when the user uses backspace key
520
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
521
+ value=datasets_original_df[OPEN_ENDED_COLS],
522
+ headers=OPEN_ENDED_COLS,
523
+ datatype=TYPES,
524
+ visible=False,
525
+ )
526
+
527
+
528
+ search_bar.submit(
529
+ update_table,
530
+ [
531
+ hidden_leaderboard_table_for_search,
532
+ shown_columns,
533
+ search_bar,
534
+ filter_columns_type,
535
+ filter_domain_specific,
536
+ filter_columns_size
537
+ # filter_columns_architecture
538
+ ],
539
+ leaderboard_table,
540
+ )
541
+ for selector in [
542
+ shown_columns,
543
+ filter_columns_type,
544
+ filter_domain_specific,
545
+ # filter_columns_architecture,
546
+ filter_columns_size,
547
+ # deleted_models_visibility,
548
+ ]:
549
+ selector.change(
550
+ update_table,
551
+ [
552
+ hidden_leaderboard_table_for_search,
553
+ shown_columns,
554
+ search_bar,
555
+ filter_columns_type,
556
+ filter_domain_specific,
557
+ filter_columns_size
558
+ # filter_columns_architecture,
559
+ ],
560
+ leaderboard_table,
561
+ queue=True,
562
+ )
563
+ with gr.Accordion("💬 Generation templates", open=False):
564
+ with gr.Accordion("Response generation", open=False):
565
+ system_prompt, user_prompt = render_generation_templates(task="open_ended", generation_type="response_generation")
566
+ with gr.Accordion("Scoring Rubric", open=False):
567
+ system_prompt, user_prompt = render_generation_templates(task="open_ended", generation_type="scoring_rubric")
568
+
569
+ with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=2):
570
+ with gr.Row():
571
+ with gr.Column():
572
+
573
+ with gr.Row():
574
+ search_bar = gr.Textbox(
575
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
576
+ show_label=False,
577
+ elem_id="search-bar",
578
+ )
579
+
580
+
581
+ with gr.Row():
582
+ shown_columns = gr.CheckboxGroup(
583
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
584
+ value=[
585
+ c.name
586
+ for c in fields(AutoEvalColumn)
587
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)
588
+ ],
589
+ label="Select columns to show",
590
+ elem_id="column-select",
591
+ interactive=True,
592
+ )
593
+
594
+
595
+ # with gr.Row():
596
+ # deleted_models_visibility = gr.Checkbox(
597
+ # value=False, label="Show gated/private/deleted models", interactive=True
598
+ # )
599
+ with gr.Column(min_width=320):
600
+
601
+ # with gr.Box(elem_id="box-filter"):
602
+ filter_columns_type = gr.CheckboxGroup(
603
+ label="Model Types",
604
+ choices=[t.to_str() for t in ModelType],
605
+ value=[t.to_str() for t in ModelType],
606
+ interactive=True,
607
+ elem_id="filter-columns-type",
608
+ )
609
+
610
+ # filter_columns_architecture = gr.CheckboxGroup(
611
+ # label="Architecture Types",
612
+ # choices=[i.value.name for i in ModelArch],
613
+ # value=[i.value.name for i in ModelArch],
614
+ # interactive=True,
615
+ # elem_id="filter-columns-architecture",
616
+ # )
617
+
618
+ filter_domain_specific = gr.CheckboxGroup(
619
+ label="Domain Specificity",
620
+ choices=["🏥 Clinical models", "Generic models"],
621
+ value=["🏥 Clinical models", "Generic models"],
622
+ interactive=True,
623
+ elem_id="filter-columns-type",
624
+ )
625
+ filter_columns_size = gr.CheckboxGroup(
626
+ label="Model sizes (in billions of parameters)",
627
+ choices=list(NUMERIC_INTERVALS.keys()),
628
+ value=list(NUMERIC_INTERVALS.keys()),
629
+ interactive=True,
630
+ elem_id="filter-columns-size",
631
+ )
632
+
633
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="med_safety")
634
+
635
+ leaderboard_table = gr.components.Dataframe(
636
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
637
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
638
+ datatype=TYPES,
639
+ elem_id="leaderboard-table",
640
+ interactive=False,
641
+ visible=True,
642
+ )
643
+
644
+ # Dummy leaderboard for handling the case when the user uses backspace key
645
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
646
+ value=datasets_original_df[MED_SAFETY_COLS],
647
+ headers=MED_SAFETY_COLS,
648
+ datatype=TYPES,
649
+ visible=False,
650
+ )
651
+
652
+
653
+ search_bar.submit(
654
+ update_table,
655
+ [
656
+ hidden_leaderboard_table_for_search,
657
+ shown_columns,
658
+ search_bar,
659
+ filter_columns_type,
660
+ filter_domain_specific,
661
+ filter_columns_size
662
+ # filter_columns_architecture
663
+ ],
664
+ leaderboard_table,
665
+ )
666
+ for selector in [
667
+ shown_columns,
668
+ filter_columns_type,
669
+ filter_domain_specific,
670
+ filter_columns_size,
671
+ # deleted_models_visibility,
672
+ ]:
673
+ selector.change(
674
+ update_table,
675
+ [
676
+ hidden_leaderboard_table_for_search,
677
+ shown_columns,
678
+ search_bar,
679
+ filter_columns_type,
680
+ filter_domain_specific,
681
+ filter_columns_size
682
+ ],
683
+ leaderboard_table,
684
+ queue=True,
685
+ )
686
+ with gr.Accordion("💬 Generation templates", open=False):
687
+ with gr.Accordion("Response generation", open=False):
688
+ system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="response_generation")
689
+ with gr.Accordion("Scoring Rubric", open=False):
690
+ system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="scoring_rubric")
691
+ with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=3):
692
+ gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
693
+ with gr.Row():
694
+ with gr.Column():
695
+ with gr.Row():
696
+ search_bar = gr.Textbox(
697
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
698
+ show_label=False,
699
+ elem_id="search-bar",
700
+ )
701
+ with gr.Row():
702
+ shown_columns = gr.CheckboxGroup(
703
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
704
+ value=[
705
+ c.name
706
+ for c in fields(AutoEvalColumn)
707
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
708
+ ],
709
+ label="Select columns to show",
710
+ elem_id="column-select",
711
+ interactive=True,
712
+ )
713
+ # with gr.Row():
714
+ # deleted_models_visibility = gr.Checkbox(
715
+ # value=False, label="Show gated/private/deleted models", interactive=True
716
+ # )
717
+ with gr.Column(min_width=320):
718
+ # with gr.Box(elem_id="box-filter"):
719
+ filter_columns_type = gr.CheckboxGroup(
720
+ label="Model Types",
721
+ choices=[t.to_str() for t in ModelType],
722
+ value=[t.to_str() for t in ModelType],
723
+ interactive=True,
724
+ elem_id="filter-columns-type",
725
+ )
726
+ # filter_columns_architecture = gr.CheckboxGroup(
727
+ # label="Architecture Types",
728
+ # choices=[i.value.name for i in ModelArch],
729
+ # value=[i.value.name for i in ModelArch],
730
+ # interactive=True,
731
+ # elem_id="filter-columns-architecture",
732
+ # )
733
+ filter_domain_specific = gr.CheckboxGroup(
734
+ label="Domain Specificity",
735
+ choices=["🏥 Clinical models", "Generic models"],
736
+ value=["🏥 Clinical models", "Generic models"],
737
+ interactive=True,
738
+ elem_id="filter-columns-type",
739
+ )
740
+ filter_columns_size = gr.CheckboxGroup(
741
+ label="Model sizes (in billions of parameters)",
742
+ choices=list(NUMERIC_INTERVALS.keys()),
743
+ value=list(NUMERIC_INTERVALS.keys()),
744
+ interactive=True,
745
+ elem_id="filter-columns-size",
746
+ )
747
+
748
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="medical_summarization")
749
+
750
+ leaderboard_table = gr.components.Dataframe(
751
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
752
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
753
+ datatype=TYPES,
754
+ elem_id="leaderboard-table",
755
+ interactive=False,
756
+ visible=True,
757
+ )
758
+
759
+ # Dummy leaderboard for handling the case when the user uses backspace key
760
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
761
+ value=datasets_original_df[MEDICAL_SUMMARIZATION_COLS],
762
+ headers=MEDICAL_SUMMARIZATION_COLS,
763
+ datatype=TYPES,
764
+ visible=False,
765
+ )
766
+
767
+
768
+ search_bar.submit(
769
+ update_table,
770
+ [
771
+ hidden_leaderboard_table_for_search,
772
+ shown_columns,
773
+ search_bar,
774
+ filter_columns_type,
775
+ filter_domain_specific,
776
+ filter_columns_size
777
+ # filter_columns_architecture
778
+ ],
779
+ leaderboard_table,
780
+ )
781
+ for selector in [
782
+ shown_columns,
783
+ filter_columns_type,
784
+ filter_domain_specific,
785
+ filter_columns_size,
786
+ # deleted_models_visibility,
787
+ ]:
788
+ selector.change(
789
+ update_table,
790
+ [
791
+ hidden_leaderboard_table_for_search,
792
+ shown_columns,
793
+ search_bar,
794
+ filter_columns_type,
795
+ filter_domain_specific,
796
+ filter_columns_size
797
+ ],
798
+ leaderboard_table,
799
+ queue=True,
800
+ )
801
+ with gr.Accordion("💬 Generation templates", open=False):
802
+ with gr.Accordion("Response generation", open=False):
803
+ system_prompt, user_prompt = render_generation_templates(task="medical_summarization", generation_type="response_generation")
804
+ with gr.Accordion("Question generation", open=False):
805
+ system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
806
+ with gr.Accordion("Cross Examination", open=False):
807
+ system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
808
+ with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=4):
809
+ gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
810
+ with gr.Tabs(elem_classes="tab-buttons2") as tabs:
811
+ with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-table2", id=0):
812
+ with gr.Row():
813
+ with gr.Column():
814
+ with gr.Row():
815
+ search_bar = gr.Textbox(
816
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
817
+ show_label=False,
818
+ elem_id="search-bar",
819
+ )
820
+ with gr.Row():
821
+ shown_columns = gr.CheckboxGroup(
822
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
823
+ value=[
824
+ c.name
825
+ for c in fields(AutoEvalColumn)
826
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)
827
+ ],
828
+ label="Select columns to show",
829
+ elem_id="column-select",
830
+ interactive=True,
831
+ )
832
+ # with gr.Row():
833
+ # deleted_models_visibility = gr.Checkbox(
834
+ # value=False, label="Show gated/private/deleted models", interactive=True
835
+ # )
836
+ with gr.Column(min_width=320):
837
+ # with gr.Box(elem_id="box-filter"):
838
+ filter_columns_type = gr.CheckboxGroup(
839
+ label="Model Types",
840
+ choices=[t.to_str() for t in ModelType],
841
+ value=[t.to_str() for t in ModelType],
842
+ interactive=True,
843
+ elem_id="filter-columns-type",
844
+ )
845
+ # filter_columns_architecture = gr.CheckboxGroup(
846
+ # label="Architecture Types",
847
+ # choices=[i.value.name for i in ModelArch],
848
+ # value=[i.value.name for i in ModelArch],
849
+ # interactive=True,
850
+ # elem_id="filter-columns-architecture",
851
+ # )
852
+ filter_domain_specific = gr.CheckboxGroup(
853
+ label="Domain Specificity",
854
+ choices=["🏥 Clinical models", "Generic models"],
855
+ value=["🏥 Clinical models", "Generic models"],
856
+ interactive=True,
857
+ elem_id="filter-columns-type",
858
+ )
859
+ filter_columns_size = gr.CheckboxGroup(
860
+ label="Model sizes (in billions of parameters)",
861
+ choices=list(NUMERIC_INTERVALS.keys()),
862
+ value=list(NUMERIC_INTERVALS.keys()),
863
+ interactive=True,
864
+ elem_id="filter-columns-size",
865
+ )
866
+
867
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="aci")
868
+
869
+ leaderboard_table = gr.components.Dataframe(
870
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
871
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
872
+ datatype=TYPES,
873
+ elem_id="leaderboard-table",
874
+ interactive=False,
875
+ visible=True,
876
+ )
877
+
878
+ # Dummy leaderboard for handling the case when the user uses backspace key
879
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
880
+ value=datasets_original_df[ACI_COLS],
881
+ headers=ACI_COLS,
882
+ datatype=TYPES,
883
+ visible=False,
884
+ )
885
+
886
+
887
+ search_bar.submit(
888
+ update_table,
889
+ [
890
+ hidden_leaderboard_table_for_search,
891
+ shown_columns,
892
+ search_bar,
893
+ filter_columns_type,
894
+ filter_domain_specific,
895
+ filter_columns_size
896
+ # filter_columns_architecture
897
+ ],
898
+ leaderboard_table,
899
+ )
900
+ for selector in [
901
+ shown_columns,
902
+ filter_columns_type,
903
+ filter_domain_specific,
904
+ filter_columns_size,
905
+ # deleted_models_visibility,
906
+ ]:
907
+ selector.change(
908
+ update_table,
909
+ [
910
+ hidden_leaderboard_table_for_search,
911
+ shown_columns,
912
+ search_bar,
913
+ filter_columns_type,
914
+ filter_domain_specific,
915
+ filter_columns_size
916
+ ],
917
+ leaderboard_table,
918
+ queue=True,
919
+ )
920
+ with gr.TabItem("SOAP Notes", elem_id="llm-benchmark-tab-table2", id=1):
921
+ with gr.Row():
922
+ with gr.Column():
923
+ with gr.Row():
924
+ search_bar = gr.Textbox(
925
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
926
+ show_label=False,
927
+ elem_id="search-bar",
928
+ )
929
+ with gr.Row():
930
+ shown_columns = gr.CheckboxGroup(
931
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
932
+ value=[
933
+ c.name
934
+ for c in fields(AutoEvalColumn)
935
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)
936
+ ],
937
+ label="Select columns to show",
938
+ elem_id="column-select",
939
+ interactive=True,
940
+ )
941
+ # with gr.Row():
942
+ # deleted_models_visibility = gr.Checkbox(
943
+ # value=False, label="Show gated/private/deleted models", interactive=True
944
+ # )
945
+ with gr.Column(min_width=320):
946
+ # with gr.Box(elem_id="box-filter"):
947
+ filter_columns_type = gr.CheckboxGroup(
948
+ label="Model Types",
949
+ choices=[t.to_str() for t in ModelType],
950
+ value=[t.to_str() for t in ModelType],
951
+ interactive=True,
952
+ elem_id="filter-columns-type",
953
+ )
954
+ # filter_columns_architecture = gr.CheckboxGroup(
955
+ # label="Architecture Types",
956
+ # choices=[i.value.name for i in ModelArch],
957
+ # value=[i.value.name for i in ModelArch],
958
+ # interactive=True,
959
+ # elem_id="filter-columns-architecture",
960
+ # )
961
+ filter_domain_specific = gr.CheckboxGroup(
962
+ label="Domain Specificity",
963
+ choices=["🏥 Clinical models", "Generic models"],
964
+ value=["🏥 Clinical models", "Generic models"],
965
+ interactive=True,
966
+ elem_id="filter-columns-type",
967
+ )
968
+ filter_columns_size = gr.CheckboxGroup(
969
+ label="Model sizes (in billions of parameters)",
970
+ choices=list(NUMERIC_INTERVALS.keys()),
971
+ value=list(NUMERIC_INTERVALS.keys()),
972
+ interactive=True,
973
+ elem_id="filter-columns-size",
974
+ )
975
+
976
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="soap")
977
+
978
+ leaderboard_table = gr.components.Dataframe(
979
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
980
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
981
+ datatype=TYPES,
982
+ elem_id="leaderboard-table",
983
+ interactive=False,
984
+ visible=True,
985
+ )
986
+
987
+ # Dummy leaderboard for handling the case when the user uses backspace key
988
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
989
+ value=datasets_original_df[SOAP_COLS],
990
+ headers=SOAP_COLS,
991
+ datatype=TYPES,
992
+ visible=False,
993
+ )
994
+
995
+
996
+ search_bar.submit(
997
+ update_table,
998
+ [
999
+ hidden_leaderboard_table_for_search,
1000
+ shown_columns,
1001
+ search_bar,
1002
+ filter_columns_type,
1003
+ filter_domain_specific,
1004
+ filter_columns_size
1005
+ # filter_columns_architecture
1006
+ ],
1007
+ leaderboard_table,
1008
+ )
1009
+ for selector in [
1010
+ shown_columns,
1011
+ filter_columns_type,
1012
+ filter_domain_specific,
1013
+ filter_columns_size,
1014
+ # deleted_models_visibility,
1015
+ ]:
1016
+ selector.change(
1017
+ update_table,
1018
+ [
1019
+ hidden_leaderboard_table_for_search,
1020
+ shown_columns,
1021
+ search_bar,
1022
+ filter_columns_type,
1023
+ filter_domain_specific,
1024
+ filter_columns_size
1025
+ ],
1026
+ leaderboard_table,
1027
+ queue=True,
1028
+ )
1029
+ with gr.Accordion("💬 Generation templates", open=False):
1030
+ with gr.Accordion("ACI-Bench Response generation", open=False):
1031
+ system_prompt, user_prompt = render_generation_templates(task="aci", generation_type="response_generation")
1032
+ with gr.Accordion("SOAP Notes Response generation", open=False):
1033
+ system_prompt, user_prompt = render_generation_templates(task="soap", generation_type="response_generation")
1034
+ with gr.Accordion("Question generation", open=False):
1035
+ system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
1036
+ with gr.Accordion("Cross Examination", open=False):
1037
+ system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
1038
+ with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=6):
1039
+ with gr.Tabs(elem_classes="tab-buttons2") as tabs:
1040
+ with gr.TabItem("English", elem_id="llm-benchmark-tab-table9", id=0):
1041
+ with gr.Row():
1042
+ with gr.Column():
1043
+ with gr.Row():
1044
+ search_bar = gr.Textbox(
1045
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
1046
+ show_label=False,
1047
+ elem_id="search-bar",
1048
+ )
1049
+ with gr.Row():
1050
+ shown_columns = gr.CheckboxGroup(
1051
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
1052
+ value=[
1053
+ c.name
1054
+ for c in fields(AutoEvalColumn)
1055
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
1056
+ ],
1057
+ label="Select columns to show",
1058
+ elem_id="column-select",
1059
+ interactive=True,
1060
+ )
1061
+ # with gr.Row():
1062
+ # deleted_models_visibility = gr.Checkbox(
1063
+ # value=False, label="Show gated/private/deleted models", interactive=True
1064
+ # )
1065
+ with gr.Column(min_width=320):
1066
+ # with gr.Box(elem_id="box-filter"):
1067
+ filter_columns_type = gr.CheckboxGroup(
1068
+ label="Model Types",
1069
+ choices=[t.to_str() for t in ModelType],
1070
+ value=[t.to_str() for t in ModelType],
1071
+ interactive=True,
1072
+ elem_id="filter-columns-type",
1073
+ )
1074
+ # filter_columns_architecture = gr.CheckboxGroup(
1075
+ # label="Architecture Types",
1076
+ # choices=[i.value.name for i in ModelArch],
1077
+ # value=[i.value.name for i in ModelArch],
1078
+ # interactive=True,
1079
+ # elem_id="filter-columns-architecture",
1080
+ # )
1081
+ filter_domain_specific = gr.CheckboxGroup(
1082
+ label="Domain Specificity",
1083
+ choices=["🏥 Clinical models", "Generic models"],
1084
+ value=["🏥 Clinical models", "Generic models"],
1085
+ interactive=True,
1086
+ elem_id="filter-columns-type",
1087
+ )
1088
+ filter_columns_size = gr.CheckboxGroup(
1089
+ label="Model sizes (in billions of parameters)",
1090
+ choices=list(NUMERIC_INTERVALS.keys()),
1091
+ value=list(NUMERIC_INTERVALS.keys()),
1092
+ interactive=True,
1093
+ elem_id="filter-columns-size",
1094
+ )
1095
+
1096
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
1097
+ leaderboard_table = gr.components.Dataframe(
1098
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1099
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1100
+ datatype=TYPES,
1101
+ elem_id="leaderboard-table",
1102
+ interactive=False,
1103
+ visible=True,
1104
+ )
1105
+
1106
+ # Dummy leaderboard for handling the case when the user uses backspace key
1107
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
1108
+ value=datasets_original_df[DATASET_COLS],
1109
+ headers=DATASET_COLS,
1110
+ datatype=TYPES,
1111
+ visible=False,
1112
+ )
1113
+
1114
+ search_bar.submit(
1115
+ update_table,
1116
+ [
1117
+ hidden_leaderboard_table_for_search,
1118
+ shown_columns,
1119
+ search_bar,
1120
+ filter_columns_type,
1121
+ filter_domain_specific,
1122
+ filter_columns_size
1123
+ # filter_columns_architecture
1124
+ ],
1125
+ leaderboard_table,
1126
+ )
1127
+ for selector in [
1128
+ shown_columns,
1129
+ filter_columns_type,
1130
+ filter_domain_specific,
1131
+ # filter_columns_architecture,
1132
+ filter_columns_size,
1133
+ # deleted_models_visibility,
1134
+ ]:
1135
+ selector.change(
1136
+ update_table,
1137
+ [
1138
+ hidden_leaderboard_table_for_search,
1139
+ shown_columns,
1140
+ search_bar,
1141
+ filter_columns_type,
1142
+ filter_domain_specific,
1143
+ filter_columns_size
1144
+ # filter_columns_architecture,
1145
+ ],
1146
+ leaderboard_table,
1147
+ queue=True,
1148
+ )
1149
+
1150
+ with gr.TabItem("🌍 Multilingual", elem_id="llm-benchmark-tab-table9", id=1):
1151
+ with gr.Row():
1152
+ with gr.Column():
1153
+ with gr.Row():
1154
+ search_bar = gr.Textbox(
1155
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
1156
+ show_label=False,
1157
+ elem_id="search-bar",
1158
+ )
1159
+ with gr.Row():
1160
+ shown_columns = gr.CheckboxGroup(
1161
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)],
1162
+ value=[
1163
+ c.name
1164
+ for c in fields(AutoEvalColumn)
1165
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)
1166
+ ],
1167
+ label="Select columns to show",
1168
+ elem_id="column-select",
1169
+ interactive=True,
1170
+ )
1171
+ # with gr.Row():
1172
+ # deleted_models_visibility = gr.Checkbox(
1173
+ # value=False, label="Show gated/private/deleted models", interactive=True
1174
+ # )
1175
+ with gr.Column(min_width=320):
1176
+ # with gr.Box(elem_id="box-filter"):
1177
+ filter_columns_type = gr.CheckboxGroup(
1178
+ label="Model Types",
1179
+ choices=[t.to_str() for t in ModelType],
1180
+ value=[t.to_str() for t in ModelType],
1181
+ interactive=True,
1182
+ elem_id="filter-columns-type",
1183
+ )
1184
+ # filter_columns_architecture = gr.CheckboxGroup(
1185
+ # label="Architecture Types",
1186
+ # choices=[i.value.name for i in ModelArch],
1187
+ # value=[i.value.name for i in ModelArch],
1188
+ # interactive=True,
1189
+ # elem_id="filter-columns-architecture",
1190
+ # )
1191
+ filter_domain_specific = gr.CheckboxGroup(
1192
+ label="Domain Specificity",
1193
+ choices=["🏥 Clinical models", "Generic models"],
1194
+ value=["🏥 Clinical models", "Generic models"],
1195
+ interactive=True,
1196
+ elem_id="filter-columns-type",
1197
+ )
1198
+ filter_columns_size = gr.CheckboxGroup(
1199
+ label="Model sizes (in billions of parameters)",
1200
+ choices=list(NUMERIC_INTERVALS.keys()),
1201
+ value=list(NUMERIC_INTERVALS.keys()),
1202
+ interactive=True,
1203
+ elem_id="filter-columns-size",
1204
+ )
1205
+
1206
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="closed_ended_multilingual")
1207
+ leaderboard_table = gr.components.Dataframe(
1208
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1209
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1210
+ datatype=TYPES,
1211
+ elem_id="leaderboard-table",
1212
+ interactive=False,
1213
+ visible=True,
1214
+ )
1215
+
1216
+ # Dummy leaderboard for handling the case when the user uses backspace key
1217
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
1218
+ value=datasets_original_df[ClosedEndedMultilingual_COLS],
1219
+ headers=ClosedEndedMultilingual_COLS,
1220
+ datatype=TYPES,
1221
+ visible=False,
1222
+ )
1223
+
1224
+ search_bar.submit(
1225
+ update_table,
1226
+ [
1227
+ hidden_leaderboard_table_for_search,
1228
+ shown_columns,
1229
+ search_bar,
1230
+ filter_columns_type,
1231
+ filter_domain_specific,
1232
+ filter_columns_size
1233
+ # filter_columns_architecture
1234
+ ],
1235
+ leaderboard_table,
1236
+ )
1237
+ for selector in [
1238
+ shown_columns,
1239
+ filter_columns_type,
1240
+ filter_domain_specific,
1241
+ # filter_columns_architecture,
1242
+ filter_columns_size,
1243
+ # deleted_models_visibility,
1244
+ ]:
1245
+ selector.change(
1246
+ update_table,
1247
+ [
1248
+ hidden_leaderboard_table_for_search,
1249
+ shown_columns,
1250
+ search_bar,
1251
+ filter_columns_type,
1252
+ filter_domain_specific,
1253
+ filter_columns_size
1254
+ # filter_columns_architecture,
1255
+ ],
1256
+ leaderboard_table,
1257
+ queue=True,
1258
+ )
1259
+
1260
+
1261
+ with gr.Row():
1262
+ with gr.Accordion("📙 Citation", open=False):
1263
+ citation_button = gr.Textbox(
1264
+ value=CITATION_BUTTON_TEXT,
1265
+ label=CITATION_BUTTON_LABEL,
1266
+ lines=20,
1267
+ elem_id="citation-button",
1268
+ show_copy_button=True,
1269
+ )
1270
+
1271
+
1272
+
1273
+ scheduler = BackgroundScheduler()
1274
+ scheduler.add_job(restart_space, "interval", seconds=1800)
1275
+ scheduler.start()
1276
+ demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'])
src/about.py CHANGED
@@ -40,6 +40,77 @@ class OpenEndedColumns(Enum):
40
  column3 = OpenEndedColumn("Score_intervals", "score", "Score 95% CI")
41
  # changes to be made here
42
43
  @dataclass
44
  class MedSafetyColumn:
45
  benchmark: str
@@ -102,11 +173,16 @@ class ClosedEndedArabicColumn:
102
  metric: str
103
  col_name: str
104
 
105
- class ClosedEndedArabicColumns(Enum):
106
- arabictask0 = ClosedEndedArabicColumn("MMLU-Arabic", "accuracy", "MMLU-Arabic")
107
- arabictask2 = ClosedEndedArabicColumn("MedMCQA-Arabic", "accuracy", "MedMCQA-Arabic")
108
- arabictask3 = ClosedEndedArabicColumn("MedQA-Arabic", "accuracy", "MedQA-Arabic")
109
- arabictask5 = ClosedEndedArabicColumn("PubMedQA-Arabic", "accuracy", "PubMedQA-Arabic")
110
 
111
 
112
  NUM_FEWSHOT = 0 # Change with your few shot
 
40
  column3 = OpenEndedColumn("Score_intervals", "score", "Score 95% CI")
41
  # changes to be made here
42
 
43
+
44
+ @dataclass
45
+ class OpenEndedMultilingualColumn:
46
+ benchmark: str
47
+ metric: str
48
+ col_name: str
49
+
50
+ class OpenEndedArabicColumn(Enum):
51
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
52
+ arabic_column0 = OpenEndedMultilingualColumn("ELO", "score", "ELO")
53
+ arabic_column1 = OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI")
54
+ arabic_column2 = OpenEndedMultilingualColumn("Score", "score", "Score")
55
+ arabic_column3 = OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI")
56
+
57
+
58
+ class OpenEndedFrenchColumn(Enum):
59
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
60
+ french_column0 = OpenEndedMultilingualColumn("ELO", "score", "ELO")
61
+ french_column1 = OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI")
62
+ french_column2 = OpenEndedMultilingualColumn("Score", "score", "Score")
63
+ french_column3 = OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI")
64
+
65
+
66
+ class OpenEndedSpanishColumn(Enum):
67
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
68
+ spanish_column0 = OpenEndedMultilingualColumn("ELO", "score", "ELO")
69
+ spanish_column1 = OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI")
70
+ spanish_column2 = OpenEndedMultilingualColumn("Score", "score", "Score")
71
+ spanish_column3 = OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI")
72
+
73
+
74
+ class OpenEndedPortugueseColumn(Enum):
75
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
76
+ porto_column0 = OpenEndedMultilingualColumn("ELO", "score", "ELO")
77
+ porto_column1 = OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI")
78
+ porto_column2 = OpenEndedMultilingualColumn("Score", "score", "Score")
79
+ porto_column3 = OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI")
80
+
81
+
82
+ class OpenEndedRomanianColumn(Enum):
83
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
84
+ rom_column0 = OpenEndedMultilingualColumn("ELO", "score", "ELO")
85
+ rom_column1 = OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI")
86
+ rom_column2 = OpenEndedMultilingualColumn("Score", "score", "Score")
87
+ rom_column3 = OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI")
88
+
89
+
90
+ class OpenEndedGreekColumn(Enum):
91
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
92
+ greek_column0 = OpenEndedMultilingualColumn("ELO", "score", "ELO")
93
+ greek_column1 = OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI")
94
+ greek_column2 = OpenEndedMultilingualColumn("Score", "score", "Score")
95
+ greek_column3 = OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI")
96
+
97
+
98
+
99
+ @dataclass
100
+ class ClosedEndedMultilingualColumn:
101
+ benchmark: str
102
+ metric: str
103
+ col_name: str
104
+
105
+
106
+ class ClosedEndedMultilingualColumns(Enum):
107
+ mtask0 = ClosedEndedMultilingualColumn("Global-MMLU-Arabic", "accuracy", "🇦🇪Arabic")
108
+ mtask1 = ClosedEndedMultilingualColumn("Global-MMLU-French", "accuracy", "🇫🇷French")
109
+ mtask2 = ClosedEndedMultilingualColumn("Global-MMLU-Spanish", "accuracy", "🇪🇸Spanish")
110
+ mtask3 = ClosedEndedMultilingualColumn("Global-MMLU-Portuguese", "accuracy", "🇵🇹Portuguese")
111
+ mtask4 = ClosedEndedMultilingualColumn("Global-MMLU-Romanian", "accuracy", "🇷🇴Romanian")
112
+ mtask5 = ClosedEndedMultilingualColumn("Global-MMLU-Greek", "accuracy", "🇬🇷Greek")
113
+
114
  @dataclass
115
  class MedSafetyColumn:
116
  benchmark: str
 
173
  metric: str
174
  col_name: str
175
 
176
+
177
+
178
+
179
+
180
+
181
+ # class ClosedEndedArabicColumns(Enum):
182
+ # arabictask0 = ClosedEndedArabicColumn("MMLU-Arabic", "accuracy", "MMLU-Arabic")
183
+ # arabictask2 = ClosedEndedArabicColumn("MedMCQA-Arabic", "accuracy", "MedMCQA-Arabic")
184
+ # arabictask3 = ClosedEndedArabicColumn("MedQA-Arabic", "accuracy", "MedQA-Arabic")
185
+ # arabictask5 = ClosedEndedArabicColumn("PubMedQA-Arabic", "accuracy", "PubMedQA-Arabic")
186
 
187
 
188
  NUM_FEWSHOT = 0 # Change with your few shot
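The six OpenEnded*Column enums added above are structurally identical (ELO, ELO 95% CI, Score, Score 95% CI). As a sketch only, not part of this commit, they could be generated from a single helper using Enum's functional API:

```python
# Editorial sketch only — not the commit's code. The six per-language
# open-ended enums share the same four columns, so they could be built
# once and instantiated per language.
from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class OpenEndedMultilingualColumn:
    benchmark: str
    metric: str
    col_name: str

def make_open_ended_columns(prefix: str) -> type[Enum]:
    members = {
        f"{prefix}_column0": OpenEndedMultilingualColumn("ELO", "score", "ELO"),
        f"{prefix}_column1": OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI"),
        f"{prefix}_column2": OpenEndedMultilingualColumn("Score", "score", "Score"),
        f"{prefix}_column3": OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI"),
    }
    return Enum(f"OpenEnded{prefix.capitalize()}Column", members)

OpenEndedArabicColumn = make_open_ended_columns("arabic")
OpenEndedFrenchColumn = make_open_ended_columns("french")
# ... and likewise for Spanish, Portuguese, Romanian and Greek.
```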
src/display/utils.py CHANGED
@@ -4,7 +4,7 @@ from enum import Enum
4
  import pandas as pd
5
 
6
  # changes to be made here
7
- from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedArabicColumns
8
  from src.envs import PRIVATE_REPO
9
  import json
10
  import gradio as gr
@@ -31,17 +31,21 @@ class ColumnContent:
31
  medical_summarization_col: bool = False
32
  aci_col: bool = False
33
  soap_col: bool = False
34
- closed_ended_arabic_col: bool = False
35
 
36
 
37
- ## Leaderboard columns
38
- auto_eval_column_dict = []
39
  # Init
40
  auto_eval_column_dict = []
41
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
42
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
43
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
44
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, closed_ended_arabic_col=True, invariant=False)])
45
  auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
46
  for task in HarnessTasks:
47
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
@@ -59,9 +63,21 @@ for column in ACIColumns:
59
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, aci_col=True, invariant=False)])
60
  for column in SOAPColumns:
61
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, soap_col=True, invariant=False)])
62
- # if PRIVATE_REPO:
63
- for column in ClosedEndedArabicColumns:
64
- auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, closed_ended_arabic_col=True, invariant=False)])
65
  auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
66
  auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
67
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
@@ -75,6 +91,13 @@ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Avai
75
  # auto_eval_column_dict.append(["display_result", ColumnContent, ColumnContent("Display Result", "bool", False, True)])
76
  auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("Submission Date", "str", False)])
77
 
78
  # We use make dataclass to dynamically fill the scores from Tasks
79
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
80
 
@@ -94,8 +117,8 @@ class EvalQueueColumn: # Queue column
94
  med_safety_status = ColumnContent("med_safety_status", "str", True)
95
  medical_summarization_status = ColumnContent("medical_summarization_status", "str", True)
96
  note_generation_status = ColumnContent("note_generation_status", "str", True)
97
- if PRIVATE_REPO:
98
- closed_ended_arabic_status = ColumnContent("closed_ended_arabic_status", "str", True)
99
 
100
  ## All the model information that we might need
101
  @dataclass
@@ -221,8 +244,22 @@ MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c
221
  MEDICAL_SUMMARIZATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medical_summarization_col or c.invariant)]
222
  ACI_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.aci_col or c.invariant)]
223
  SOAP_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.soap_col or c.invariant)]
224
  # if PRIVATE_REPO:
225
- CLOSED_ENDED_ARABIC_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.closed_ended_arabic_col or c.invariant)]
226
  # CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.cross_examination_col or c.invariant)]
227
  # DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
228
  # OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
@@ -243,8 +280,26 @@ MED_SAFETY_BENCHMARK_COLS = [t.value.col_name for t in MedSafetyColumns]
243
  MEDICAL_SUMMARIZATION_BENCHMARK_COLS = [t.value.col_name for t in MedicalSummarizationColumns]
244
  ACI_BENCHMARK_COLS = [t.value.col_name for t in ACIColumns]
245
  SOAP_BENCHMARK_COLS = [t.value.col_name for t in SOAPColumns]
246
- # if PRIVATE_REPO:
247
- CLOSED_ENDED_ARABIC_BENCHMARK_COLS = [t.value.col_name for t in ClosedEndedArabicColumns]
248
  # CROSS_EXAMINATION_BENCHMARK_COLS = [t.value.col_name for t in CrossExaminationTasks]
249
 
250
  NUMERIC_INTERVALS = {
 
4
  import pandas as pd
5
 
6
  # changes to be made here
7
+ from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedMultilingualColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn
8
  from src.envs import PRIVATE_REPO
9
  import json
10
  import gradio as gr
 
31
  medical_summarization_col: bool = False
32
  aci_col: bool = False
33
  soap_col: bool = False
34
+ open_ended_arabic_col: bool = False
35
+ open_ended_french_col: bool = False
36
+ open_ended_spanish_col: bool = False
37
+ open_ended_portuguese_col: bool = False
38
+ open_ended_romanian_col: bool = False
39
+ open_ended_greek_col: bool = False
40
+ closed_ended_multilingual_col: bool = False
41
 
42
 
 
 
43
  # Init
44
  auto_eval_column_dict = []
45
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
46
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
47
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
48
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, closed_ended_multilingual_col=True, invariant=False)])
49
  auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
50
  for task in HarnessTasks:
51
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
 
63
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, aci_col=True, invariant=False)])
64
  for column in SOAPColumns:
65
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, soap_col=True, invariant=False)])
66
+ for column in OpenEndedArabicColumn:
67
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_arabic_col=True, invariant=False)])
68
+ for column in OpenEndedFrenchColumn:
69
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_french_col=True, invariant=False)])
70
+ for column in OpenEndedSpanishColumn:
71
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_spanish_col=True, invariant=False)])
72
+ for column in OpenEndedPortugueseColumn:
73
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_portuguese_col=True, invariant=False)])
74
+ for column in OpenEndedRomanianColumn:
75
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_romanian_col=True, invariant=False)])
76
+ for column in OpenEndedGreekColumn:
77
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_greek_col=True, invariant=False)])
78
+ for column in ClosedEndedMultilingualColumns:
79
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, closed_ended_multilingual_col=True, invariant=False)])
80
+
81
  auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
82
  auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
83
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 
91
  # auto_eval_column_dict.append(["display_result", ColumnContent, ColumnContent("Display Result", "bool", False, True)])
92
  auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("Submission Date", "str", False)])
93
 
94
+ # from dataclasses import make_dataclass, field
95
+
96
+ # Example of fixing mutable defaults
97
+ # auto_eval_column_dict = {
98
+ # "example_field": field(default_factory=dict), # Replace mutable default
99
+ # "another_field": field(default_factory=list), # Replace mutable default
100
+ # }
101
  # We use make dataclass to dynamically fill the scores from Tasks
102
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
103
 
 
117
  med_safety_status = ColumnContent("med_safety_status", "str", True)
118
  medical_summarization_status = ColumnContent("medical_summarization_status", "str", True)
119
  note_generation_status = ColumnContent("note_generation_status", "str", True)
120
+ # if PRIVATE_REPO:
121
+ # closed_ended_arabic_status = ColumnContent("closed_ended_arabic_status", "str", True)
122
 
123
  ## All the model information that we might need
124
  @dataclass
 
244
  MEDICAL_SUMMARIZATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medical_summarization_col or c.invariant)]
245
  ACI_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.aci_col or c.invariant)]
246
  SOAP_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.soap_col or c.invariant)]
247
+
248
+ OpenEndedArabic_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_arabic_col or c.invariant)]
249
+ OpenEndedFrench_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_french_col or c.invariant)]
250
+ OpenEndedSpanish_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_spanish_col or c.invariant)]
251
+ OpenEndedPortuguese_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_portuguese_col or c.invariant)]
252
+ OpenEndedRomanian_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_romanian_col or c.invariant)]
253
+ OpenEndedGreek_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_greek_col or c.invariant)]
254
+
255
+
256
+
257
+ ClosedEndedMultilingual_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.closed_ended_multilingual_col or c.invariant)]
258
+
259
+
260
+
261
  # if PRIVATE_REPO:
262
+ #CLOSED_ENDED_ARABIC_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.closed_ended_arabic_col or c.invariant)]
263
  # CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.cross_examination_col or c.invariant)]
264
  # DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
265
  # OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
 
280
  MEDICAL_SUMMARIZATION_BENCHMARK_COLS = [t.value.col_name for t in MedicalSummarizationColumns]
281
  ACI_BENCHMARK_COLS = [t.value.col_name for t in ACIColumns]
282
  SOAP_BENCHMARK_COLS = [t.value.col_name for t in SOAPColumns]
283
+
284
+
285
+ # Benchmark column lists for the new multilingual subsets
286
+ OpenEndedArabic_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedArabicColumn]
287
+ OpenEndedFrench_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedFrenchColumn]
288
+ OpenEndedPortuguese_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedPortugueseColumn]
289
+ OpenEndedSpanish_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedSpanishColumn]
290
+ OpenEndedRomanian_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedRomanianColumn]
291
+ OpenEndedGreek_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedGreekColumn]
292
+
293
+
294
+ ClosedEndedMultilingual_BENCHMARK_COLS = [t.value.col_name for t in ClosedEndedMultilingualColumns]
295
+
296
+
297
+
298
+
299
+
300
+
301
+ # # if PRIVATE_REPO:
302
+ # CLOSED_ENDED_ARABIC_BENCHMARK_COLS = [t.value.col_name for t in ClosedEndedArabicColumns]
303
  # CROSS_EXAMINATION_BENCHMARK_COLS = [t.value.col_name for t in CrossExaminationTasks]
304
 
305
  NUMERIC_INTERVALS = {
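All of these column lists feed one mechanism: auto_eval_column_dict collects [name, type, default] entries, make_dataclass turns them into the frozen AutoEvalColumn class, and the *_COLS lists are comprehensions over its fields. A small stand-alone illustration of that mechanism, with illustrative field names (the Space's real ColumnContent carries many more flags):

```python
# Stand-alone illustration of the make_dataclass mechanism used above.
# Field names are illustrative; only the key structure matches the repo.
from dataclasses import dataclass, fields, make_dataclass

@dataclass(frozen=True)  # a hashable default avoids dataclass's "mutable default" error
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False

column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True)],
    ["arabic", ColumnContent, ColumnContent("🇦🇪Arabic", "number", True)],
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", column_dict, frozen=True)

# The *_COLS lists are then plain comprehensions over the generated fields:
COLS = [f.default.name for f in fields(AutoEvalColumn) if not f.default.hidden]
print(COLS)  # ['Model', '🇦🇪Arabic']
```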
src/leaderboard/instr.txt ADDED
@@ -0,0 +1,16 @@
1
+ In about.py:
2
+ The flow goes from app.py to read_evals.py to utils.py to about.py; about.py defines the tasks and the columns (for closed-ended, define the languages; for open-ended, reuse the same columns: 95% CI, ELO rating, ...).
3
+ Define a column class for open-ended multilingual (once per language, 6 in total) and one for closed-ended multilingual (Global-MMLU).
4
+ So there are 6 column enums for open-ended and one separate enum for closed-ended multilingual.
5
+
6
+ In utils.py:
7
+
8
+ Define the per-language columns again (the hidden flags do not matter here, but the columns must be declared up front).
9
+
10
+ In read_evals.py:
11
+
12
+ Define how the results are read into the data frames, and how the interval fields are built.
13
+
14
+ For the front end:
15
+
16
+ In app.py, add a gr.TabItem for open-ended, follow the HealthBench tab, and add the languages with the same logic as "ALL".
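For orientation, this is the results-file layout the new parsing in read_evals.py (below) expects, inferred from that code; the key names follow the enums in about.py and the numbers are invented:

```python
# Illustrative results-file layout, inferred from the parsing code below.
# Only the key structure matters; the values are made up.
example_results = {
    "results": {
        "closed-ended-multilingual": {
            "Global-MMLU-Arabic": {"accuracy": 0.71},
            "Global-MMLU-French": {"accuracy": 0.68},
            # ... one entry per Global-MMLU language
        },
        "open-ended-arabic": {
            "overall": {
                "ELO": 1042.0,
                "ELO_intervals": [-12.3, 11.8],   # [lower, upper] -> rendered as "+11.8/-12.3"
                "Score": 7.4,
                "Score_intervals": [-0.2, 0.3],
            }
        },
        # "open-ended-french", "open-ended-spanish", "open-ended-portuguese",
        # "open-ended-romanian" and "open-ended-greek" follow the same layout.
    }
}
```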
src/leaderboard/read_evals.py CHANGED
@@ -9,7 +9,7 @@ import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
  # changes to be made here
12
- from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedArabicColumns
13
  from src.submission.check_validity import is_model_on_hub
14
  from src.envs import PRIVATE_REPO
15
 
@@ -30,7 +30,13 @@ class EvalResult:
30
  medical_summarization_results: dict
31
  aci_results: dict
32
  soap_results: dict
33
- closed_ended_arabic_results: dict
34
  is_domain_specific: bool
35
  use_chat_template: bool
36
  # clinical_type_results:dict
@@ -108,7 +114,7 @@ class EvalResult:
108
  open_ended_results = {}
109
  if "open-ended" in data["results"]:
110
  for task in OpenEndedColumns:
111
- task = task.value
112
  # We average all scores of a given metric (not all metrics are present in all files)
113
  accs = data["results"]["open-ended"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended"]["overall"] else None
114
  open_ended_results[task.benchmark] = accs
@@ -167,20 +173,90 @@ class EvalResult:
167
  continue
168
  mean_acc = np.mean(accs) # * 100.0
169
  soap_results[task.benchmark] = mean_acc
170
- closed_ended_arabic_results = {}
171
- if PRIVATE_REPO and "closed-ended-arabic" in data["results"]:
172
- for task in ClosedEndedArabicColumns:
173
  task = task.value
174
  # We average all scores of a given metric (not all metrics are present in all files)
175
- try:
176
- accs = np.array([v.get(task.metric, None) for k, v in data["results"]["closed-ended-arabic"].items() if task.benchmark == k])
177
- except:
178
- # breakpoint()
179
- accs = np.array([])
180
- if accs.size == 0 or any([acc is None for acc in accs]):
181
- continue
182
- mean_acc = np.mean(accs) # * 100.0
183
- closed_ended_arabic_results[task.benchmark] = mean_acc
184
  # if open_ended_results == {} or med_safety_results == {} or medical_summarization_results == {} or aci_results == {} or soap_results == {}:
185
  # open_ended_results = {}
186
  # med_safety_results = {}
@@ -212,7 +288,13 @@ class EvalResult:
212
  medical_summarization_results=medical_summarization_results,
213
  aci_results=aci_results,
214
  soap_results=soap_results,
215
- closed_ended_arabic_results=closed_ended_arabic_results,
216
  is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
217
  use_chat_template=config.get("use_chat_template", False), # Assuming a default value
218
  precision=precision,
@@ -315,12 +397,42 @@ class EvalResult:
315
  for task in SOAPColumns:
316
  data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
317
  return data_dict
318
- if PRIVATE_REPO and subset == "closed_ended_arabic":
319
- average = sum([v for v in self.closed_ended_arabic_results.values() if v is not None]) / len(ClosedEndedArabicColumns)
320
  data_dict[AutoEvalColumn.average.name] = average
321
- if len(self.closed_ended_arabic_results) > 0:
322
- for task in ClosedEndedArabicColumns:
323
- data_dict[task.value.col_name] = self.closed_ended_arabic_results[task.value.benchmark]
324
  return data_dict
325
 
326
  def get_request_file_for_model(requests_path, model_name, precision):
 
9
 
10
  from src.display.formatting import make_clickable_model
11
  # changes to be made here
12
+ from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedMultilingualColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn
13
  from src.submission.check_validity import is_model_on_hub
14
  from src.envs import PRIVATE_REPO
15
 
 
30
  medical_summarization_results: dict
31
  aci_results: dict
32
  soap_results: dict
33
+ open_ended_arabic_results: dict
34
+ open_ended_french_results: dict
35
+ open_ended_spanish_results: dict
36
+ open_ended_portuguese_results: dict
37
+ open_ended_romanian_results: dict
38
+ open_ended_greek_results: dict
39
+ closed_ended_multilingual_results: dict
40
  is_domain_specific: bool
41
  use_chat_template: bool
42
  # clinical_type_results:dict
 
114
  open_ended_results = {}
115
  if "open-ended" in data["results"]:
116
  for task in OpenEndedColumns:
117
+ task = task.value
118
  # We average all scores of a given metric (not all metrics are present in all files)
119
  accs = data["results"]["open-ended"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended"]["overall"] else None
120
  open_ended_results[task.benchmark] = accs
 
173
  continue
174
  mean_acc = np.mean(accs) # * 100.0
175
  soap_results[task.benchmark] = mean_acc
176
+ open_ended_arabic_results = {}
177
+ if "open-ended-arabic" in data["results"]:
178
+ for task in OpenEndedArabicColumn:
179
  task = task.value
180
  # We average all scores of a given metric (not all metrics are present in all files)
181
+ accs = data["results"]["open-ended-arabic"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended-arabic"]["overall"] else None
182
+ open_ended_arabic_results[task.benchmark] = accs
183
+ if open_ended_arabic_results["ELO_intervals"] is not None and open_ended_arabic_results["Score_intervals"] is not None:
184
+ open_ended_arabic_results["ELO_intervals"] = "+" + str(open_ended_arabic_results["ELO_intervals"][1]) + "/-" + str(abs(float(open_ended_arabic_results["ELO_intervals"][0])))
185
+ open_ended_arabic_results["Score_intervals"] = "+" + str(open_ended_arabic_results["Score_intervals"][1]) + "/-" + str(abs(float(open_ended_arabic_results["Score_intervals"][0])))
186
+ open_ended_french_results = {}
187
+ if "open-ended-french" in data["results"]:
188
+ for task in OpenEndedFrenchColumn:
189
+ task = task.value
190
+ # We average all scores of a given metric (not all metrics are present in all files)
191
+ accs = data["results"]["open-ended-french"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended-french"]["overall"] else None
192
+ open_ended_french_results[task.benchmark] = accs
193
+ if open_ended_french_results["ELO_intervals"] is not None and open_ended_french_results["Score_intervals"] is not None:
194
+ open_ended_french_results["ELO_intervals"] = "+" + str(open_ended_french_results["ELO_intervals"][1]) + "/-" + str(abs(open_ended_french_results["ELO_intervals"][0]))
195
+ open_ended_french_results["Score_intervals"] = "+" + str(open_ended_french_results["Score_intervals"][1]) + "/-" + str(abs(open_ended_french_results["Score_intervals"][0]))
196
+ open_ended_spanish_results = {}
197
+ if "open-ended-spanish" in data["results"]:
198
+ for task in OpenEndedSpanishColumn:
199
+ task = task.value
200
+ # We average all scores of a given metric (not all metrics are present in all files)
201
+ accs = data["results"]["open-ended-spanish"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended-spanish"]["overall"] else None
202
+ open_ended_spanish_results[task.benchmark] = accs
203
+ if open_ended_spanish_results["ELO_intervals"] is not None and open_ended_spanish_results["Score_intervals"] is not None:
204
+ open_ended_spanish_results["ELO_intervals"] = "+" + str(open_ended_spanish_results["ELO_intervals"][1]) + "/-" + str(abs(open_ended_spanish_results["ELO_intervals"][0]))
205
+ open_ended_spanish_results["Score_intervals"] = "+" + str(open_ended_spanish_results["Score_intervals"][1]) + "/-" + str(abs(open_ended_spanish_results["Score_intervals"][0]))
206
+ open_ended_portuguese_results = {}
207
+ if "open-ended-portuguese" in data["results"]:
208
+ for task in OpenEndedPortugueseColumn:
209
+ task = task.value
210
+ # We average all scores of a given metric (not all metrics are present in all files)
211
+ accs = data["results"]["open-ended-portuguese"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended-portuguese"]["overall"] else None
212
+ open_ended_portuguese_results[task.benchmark] = accs
213
+ if open_ended_portuguese_results["ELO_intervals"] is not None and open_ended_portuguese_results["Score_intervals"] is not None:
214
+ open_ended_portuguese_results["ELO_intervals"] = "+" + str(open_ended_portuguese_results["ELO_intervals"][1]) + "/-" + str(abs(open_ended_portuguese_results["ELO_intervals"][0]))
215
+ open_ended_portuguese_results["Score_intervals"] = "+" + str(open_ended_portuguese_results["Score_intervals"][1]) + "/-" + str(abs(open_ended_portuguese_results["Score_intervals"][0]))
216
+ open_ended_romanian_results = {}
217
+ if "open-ended-romanian" in data["results"]:
218
+ for task in OpenEndedRomanianColumn:
219
+ task = task.value
220
+ # We average all scores of a given metric (not all metrics are present in all files)
221
+ accs = data["results"]["open-ended-romanian"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended-romanian"]["overall"] else None
222
+ open_ended_romanian_results[task.benchmark] = accs
223
+ if open_ended_romanian_results["ELO_intervals"] is not None and open_ended_romanian_results["Score_intervals"] is not None:
224
+ open_ended_romanian_results["ELO_intervals"] = "+" + str(open_ended_romanian_results["ELO_intervals"][1]) + "/-" + str(abs(open_ended_romanian_results["ELO_intervals"][0]))
225
+ open_ended_romanian_results["Score_intervals"] = "+" + str(open_ended_romanian_results["Score_intervals"][1]) + "/-" + str(abs(open_ended_romanian_results["Score_intervals"][0]))
226
+ open_ended_greek_results = {}
227
+ if "open-ended-greek" in data["results"]:
228
+ for task in OpenEndedGreekColumn:
229
+ task = task.value
230
+ # We average all scores of a given metric (not all metrics are present in all files)
231
+ accs = data["results"]["open-ended-greek"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended-greek"]["overall"] else None
232
+ open_ended_greek_results[task.benchmark] = accs
233
+ if open_ended_greek_results["ELO_intervals"] is not None and open_ended_greek_results["Score_intervals"] is not None:
234
+ open_ended_greek_results["ELO_intervals"] = "+" + str(open_ended_greek_results["ELO_intervals"][1]) + "/-" + str(abs(float(open_ended_greek_results["ELO_intervals"][0])))
235
+ open_ended_greek_results["Score_intervals"] = "+" + str(open_ended_greek_results["Score_intervals"][1]) + "/-" + str(abs(float(open_ended_greek_results["Score_intervals"][0])))
236
+ closed_ended_multilingual_results = {}
237
+ if "closed-ended-multilingual" in data["results"]:
238
+ for task in ClosedEndedMultilingualColumns:
239
+ task = task.value
240
+ accs = data["results"]["closed-ended-multilingual"][task.benchmark]["accuracy"] if task.benchmark in data["results"]["closed-ended-multilingual"] else None
241
+ closed_ended_multilingual_results[task.benchmark] = accs
242
+
243
+ # The old closed-ended-arabic parsing is kept below for reference:
244
+ # closed_ended_arabic_results = {}
245
+ # if PRIVATE_REPO and "closed-ended-arabic" in data["results"]:
246
+ # for task in ClosedEndedArabicColumns:
247
+ # task = task.value
248
+ # # We average all scores of a given metric (not all metrics are present in all files)
249
+ # try:
250
+ # accs = np.array([v.get(task.metric, None) for k, v in data["results"]["closed-ended-arabic"].items() if task.benchmark == k])
251
+ # except:
252
+ # # breakpoint()
253
+ # accs = np.array([])
254
+ # if accs.size == 0 or any([acc is None for acc in accs]):
255
+ # continue
256
+ # mean_acc = np.mean(accs) # * 100.0
257
+ # closed_ended_arabic_results[task.benchmark] = mean_acc
258
+
259
+
260
  # if open_ended_results == {} or med_safety_results == {} or medical_summarization_results == {} or aci_results == {} or soap_results == {}:
261
  # open_ended_results = {}
262
  # med_safety_results = {}
 
288
  medical_summarization_results=medical_summarization_results,
289
  aci_results=aci_results,
290
  soap_results=soap_results,
291
+ open_ended_arabic_results=open_ended_arabic_results,
292
+ open_ended_french_results=open_ended_french_results,
293
+ open_ended_spanish_results=open_ended_spanish_results,
294
+ open_ended_portuguese_results=open_ended_portuguese_results,
295
+ open_ended_romanian_results=open_ended_romanian_results,
296
+ open_ended_greek_results=open_ended_greek_results,
297
+ closed_ended_multilingual_results=closed_ended_multilingual_results,
298
  is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
299
  use_chat_template=config.get("use_chat_template", False), # Assuming a default value
300
  precision=precision,
 
397
  for task in SOAPColumns:
398
  data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
399
  return data_dict
400
+ if subset == "open_ended_arabic":
401
+ if len(self.open_ended_arabic_results) > 0:
402
+ for task in OpenEndedArabicColumn:
403
+ data_dict[task.value.col_name] = self.open_ended_arabic_results[task.value.benchmark]
404
+ return data_dict
405
+ if subset == "open_ended_french":
406
+ if len(self.open_ended_french_results) > 0:
407
+ for task in OpenEndedFrenchColumn:
408
+ data_dict[task.value.col_name] = self.open_ended_french_results[task.value.benchmark]
409
+ return data_dict
410
+ if subset == "open_ended_spanish":
411
+ if len(self.open_ended_spanish_results) > 0:
412
+ for task in OpenEndedSpanishColumn:
413
+ data_dict[task.value.col_name] = self.open_ended_spanish_results[task.value.benchmark]
414
+ return data_dict
415
+ if subset == "open_ended_portuguese":
416
+ if len(self.open_ended_portuguese_results) > 0:
417
+ for task in OpenEndedPortugueseColumn:
418
+ data_dict[task.value.col_name] = self.open_ended_portuguese_results[task.value.benchmark]
419
+ return data_dict
420
+ if subset == "open_ended_romanian":
421
+ if len(self.open_ended_romanian_results) > 0:
422
+ for task in OpenEndedRomanianColumn:
423
+ data_dict[task.value.col_name] = self.open_ended_romanian_results[task.value.benchmark]
424
+ return data_dict
425
+ if subset == "open_ended_greek":
426
+ if len(self.open_ended_greek_results) > 0:
427
+ for task in OpenEndedGreekColumn:
428
+ data_dict[task.value.col_name] = self.open_ended_greek_results[task.value.benchmark]
429
+ return data_dict
430
+ if subset == "closed_ended_multilingual":
431
+ average = sum([v for v in self.closed_ended_multilingual_results.values() if v is not None]) / len(ClosedEndedMultilingualColumns)
432
  data_dict[AutoEvalColumn.average.name] = average
433
+ if len(self.closed_ended_multilingual_results) > 0:
434
+ for task in ClosedEndedMultilingualColumns:
435
+ data_dict[task.value.col_name] = self.closed_ended_multilingual_results[task.value.benchmark]
436
  return data_dict
437
 
438
  def get_request_file_for_model(requests_path, model_name, precision):
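The six per-language blocks added to init_from_json_file above repeat the same parse-and-format steps. As a sketch only, not part of this commit, they could be collapsed into one helper keyed on the results-file section name:

```python
# Editorial sketch only — not the commit's code. One helper could replace the
# six near-identical open-ended-<language> parsing blocks above.
def parse_open_ended_section(results: dict, section: str, columns) -> dict:
    """Collect ELO/Score fields for one language and format the CI fields."""
    parsed = {}
    if section not in results:
        return parsed
    overall = results[section]["overall"]
    for task in columns:
        task = task.value
        parsed[task.benchmark] = overall.get(task.benchmark)
    if parsed.get("ELO_intervals") is not None and parsed.get("Score_intervals") is not None:
        for key in ("ELO_intervals", "Score_intervals"):
            lower, upper = parsed[key]                 # [lower, upper] interval
            parsed[key] = f"+{upper}/-{abs(float(lower))}"
    return parsed

# Usage inside init_from_json_file, assuming the enums defined in about.py:
# open_ended_arabic_results = parse_open_ended_section(data["results"], "open-ended-arabic", OpenEndedArabicColumn)
# open_ended_french_results = parse_open_ended_section(data["results"], "open-ended-french", OpenEndedFrenchColumn)
```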
src/populate.py CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  # changes to be made here
8
- from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedArabicColumns
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
  from src.envs import PRIVATE_REPO
11
 
@@ -16,14 +16,15 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
16
  # print(raw_data)
17
  # raise Exception("stop")
18
  all_data_json = [v.to_dict(subset=subset) for v in raw_data]
19
-
 
20
  df = pd.DataFrame.from_records(all_data_json)
21
  # changes to be made here
22
  if subset == "datasets":
23
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
24
  elif subset == "med_safety":
25
  df = df.sort_values(by=["Harmfulness Score"], ascending=True)
26
- elif subset == "open_ended":
27
  df = df.sort_values(by=["ELO"], ascending=False)
28
  elif subset == "medical_summarization":
29
  df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
@@ -31,7 +32,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
31
  df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
32
  elif subset == "soap":
33
  df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
34
- elif subset == "closed_ended_arabic":
35
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
36
  cols = list(set(df.columns).intersection(set(cols)))
37
  df = df[cols].round(decimals=2)
 
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  # changes to be made here
8
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn, ClosedEndedMultilingualColumns
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
  from src.envs import PRIVATE_REPO
11
 
 
16
  # print(raw_data)
17
  # raise Exception("stop")
18
  all_data_json = [v.to_dict(subset=subset) for v in raw_data]
19
+ # if subset.startswith("open_ended"):
20
+ # breakpoint()
21
  df = pd.DataFrame.from_records(all_data_json)
22
  # changes to be made here
23
  if subset == "datasets":
24
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
25
  elif subset == "med_safety":
26
  df = df.sort_values(by=["Harmfulness Score"], ascending=True)
27
+ elif subset.startswith("open_ended"):
28
  df = df.sort_values(by=["ELO"], ascending=False)
29
  elif subset == "medical_summarization":
30
  df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
 
32
  df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
33
  elif subset == "soap":
34
  df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
35
+ elif subset == "closed_ended_multilingual":
36
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
37
  cols = list(set(df.columns).intersection(set(cols)))
38
  df = df[cols].round(decimals=2)
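The sort logic in get_leaderboard_df now dispatches on the subset name: any subset starting with "open_ended" sorts by ELO, and the others sort by their headline column. As a sketch only, not the commit's code, the same dispatch written as a lookup table:

```python
# Editorial sketch only — not the commit's code. The subset -> sort-key
# dispatch in get_leaderboard_df expressed as a lookup table.
import pandas as pd

SORT_KEYS = {
    "datasets": ("Average", False),
    "med_safety": ("Harmfulness Score", True),   # lower harmfulness is better
    "medical_summarization": ("Overall Score", False),
    "soap": ("Overall Score", False),
    "closed_ended_multilingual": ("Average", False),
}

def sort_leaderboard(df: pd.DataFrame, subset: str) -> pd.DataFrame:
    if subset.startswith("open_ended"):
        column, ascending = "ELO", False
    else:
        column, ascending = SORT_KEYS[subset]
    return df.sort_values(by=[column], ascending=ascending)
```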