tathagataraha committed on
Commit 553b217 · 1 Parent(s): 6c10fa6

[ADD] Cross-examination framework

Files changed (5)
  1. app.py +357 -7
  2. src/about.py +36 -0
  3. src/display/utils.py +19 -3
  4. src/leaderboard/read_evals.py +58 -9
  5. src/populate.py +13 -3
app.py CHANGED
@@ -24,9 +24,15 @@ from src.display.utils import (
24
  DATASET_BENCHMARK_COLS,
25
  OPEN_ENDED_BENCHMARK_COLS,
26
  MED_SAFETY_BENCHMARK_COLS,
27
  DATASET_COLS,
28
  OPEN_ENDED_COLS,
29
  MED_SAFETY_COLS,
30
  EVAL_COLS,
31
  EVAL_TYPES,
32
  NUMERIC_INTERVALS,
@@ -75,7 +81,15 @@ open_ended_leaderboard_df = open_ended_original_df.copy()
75
  _, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
76
  med_safety_leaderboard_df = med_safety_original_df.copy()
77
 
78
- # breakpoint()
79
  # breakpoint()
80
  # # Token based results
81
  # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
@@ -91,7 +105,7 @@ med_safety_leaderboard_df = med_safety_original_df.copy()
91
  pending_eval_queue_df,
92
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
93
 
94
-
95
  def update_df(shown_columns, subset="datasets"):
96
  # changes to be made here
97
  if subset == "datasets":
@@ -103,6 +117,15 @@ def update_df(shown_columns, subset="datasets"):
103
  elif subset == "med_safety":
104
  leaderboard_table_df = med_safety_leaderboard_df.copy()
105
  hidden_leader_board_df = med_safety_original_df
106
  # else:
107
  # match evaluation_metric:
108
  # case "Span Based":
@@ -217,6 +240,7 @@ def filter_models(
217
 
218
  demo = gr.Blocks(css=custom_css)
219
  with demo:
220
  gr.HTML(TITLE)
221
  gr.HTML(LOGO, elem_classes="logo")
222
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
@@ -555,18 +579,344 @@ with demo:
555
  queue=True,
556
  )
557
 
558
- with gr.TabItem("🏅 Cross Examination", elem_id="llm-benchmark-tab-table", id=3):
559
- gr.Markdown("# Coming Soon!!!", elem_classes="markdown-text")
560
- pass
561
 
562
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
563
  gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
564
  # gr.HTML(EVALUATION_EXAMPLE_IMG, elem_classes="logo")
565
  # gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
566
  # gr.HTML(ENTITY_DISTRIBUTION_IMG, elem_classes="logo")
567
  # gr.Markdown(LLM_BENCHMARKS_TEXT_3, elem_classes="markdown-text")
568
 
569
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=5):
570
  with gr.Column():
571
  with gr.Row():
572
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
24
  DATASET_BENCHMARK_COLS,
25
  OPEN_ENDED_BENCHMARK_COLS,
26
  MED_SAFETY_BENCHMARK_COLS,
27
+ MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
28
+ ACI_BENCHMARK_COLS,
29
+ SOAP_BENCHMARK_COLS,
30
  DATASET_COLS,
31
  OPEN_ENDED_COLS,
32
  MED_SAFETY_COLS,
33
+ MEDICAL_SUMMARIZATION_COLS,
34
+ ACI_COLS,
35
+ SOAP_COLS,
36
  EVAL_COLS,
37
  EVAL_TYPES,
38
  NUMERIC_INTERVALS,
 
81
  _, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
82
  med_safety_leaderboard_df = med_safety_original_df.copy()
83
 
84
+ _, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
85
+ medical_summarization_leaderboard_df = medical_summarization_original_df.copy()
86
+
87
+ _, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
88
+ aci_leaderboard_df = aci_original_df.copy()
89
+
90
+ _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
91
+ soap_leaderboard_df = soap_original_df.copy()
92
+
93
  # breakpoint()
94
  # # Token based results
95
  # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
 
105
  pending_eval_queue_df,
106
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
107
 
108
+ # breakpoint()
109
  def update_df(shown_columns, subset="datasets"):
110
  # changes to be made here
111
  if subset == "datasets":
 
117
  elif subset == "med_safety":
118
  leaderboard_table_df = med_safety_leaderboard_df.copy()
119
  hidden_leader_board_df = med_safety_original_df
120
+ elif subset == "medical_summarization":
121
+ leaderboard_table_df = medical_summarization_leaderboard_df.copy()
122
+ hidden_leader_board_df = medical_summarization_original_df
123
+ elif subset == "aci":
124
+ leaderboard_table_df = aci_leaderboard_df.copy()
125
+ hidden_leader_board_df = aci_original_df
126
+ elif subset == "soap":
127
+ leaderboard_table_df = soap_leaderboard_df.copy()
128
+ hidden_leader_board_df = soap_original_df
129
  # else:
130
  # match evaluation_metric:
131
  # case "Span Based":
 
240
 
241
  demo = gr.Blocks(css=custom_css)
242
  with demo:
243
+ print("hello")
244
  gr.HTML(TITLE)
245
  gr.HTML(LOGO, elem_classes="logo")
246
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
579
  queue=True,
580
  )
581
 
582
+ with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=3):
583
+ with gr.Row():
584
+ with gr.Column():
585
+ with gr.Row():
586
+ search_bar = gr.Textbox(
587
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
588
+ show_label=False,
589
+ elem_id="search-bar",
590
+ )
591
+ with gr.Row():
592
+ shown_columns = gr.CheckboxGroup(
593
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
594
+ value=[
595
+ c.name
596
+ for c in fields(AutoEvalColumn)
597
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
598
+ ],
599
+ label="Select columns to show",
600
+ elem_id="column-select",
601
+ interactive=True,
602
+ )
603
+ # with gr.Row():
604
+ # deleted_models_visibility = gr.Checkbox(
605
+ # value=False, label="Show gated/private/deleted models", interactive=True
606
+ # )
607
+ with gr.Column(min_width=320):
608
+ # with gr.Box(elem_id="box-filter"):
609
+ filter_columns_type = gr.CheckboxGroup(
610
+ label="Model Types",
611
+ choices=[t.to_str() for t in ModelType],
612
+ value=[t.to_str() for t in ModelType],
613
+ interactive=True,
614
+ elem_id="filter-columns-type",
615
+ )
616
+ # filter_columns_architecture = gr.CheckboxGroup(
617
+ # label="Architecture Types",
618
+ # choices=[i.value.name for i in ModelArch],
619
+ # value=[i.value.name for i in ModelArch],
620
+ # interactive=True,
621
+ # elem_id="filter-columns-architecture",
622
+ # )
623
+ filter_domain_specific = gr.CheckboxGroup(
624
+ label="Domain specific models",
625
+ choices=["Yes", "No"],
626
+ value=["Yes", "No"],
627
+ interactive=True,
628
+ elem_id="filter-columns-type",
629
+ )
630
+ filter_columns_size = gr.CheckboxGroup(
631
+ label="Model sizes (in billions of parameters)",
632
+ choices=list(NUMERIC_INTERVALS.keys()),
633
+ value=list(NUMERIC_INTERVALS.keys()),
634
+ interactive=True,
635
+ elem_id="filter-columns-size",
636
+ )
637
+
638
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="medical_summarization")
639
+
640
+ leaderboard_table = gr.components.Dataframe(
641
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
642
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
643
+ datatype=TYPES,
644
+ elem_id="leaderboard-table",
645
+ interactive=False,
646
+ visible=True,
647
+ )
648
 
649
+ # Dummy leaderboard for handling the case when the user uses backspace key
650
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
651
+ value=datasets_original_df[MEDICAL_SUMMARIZATION_COLS],
652
+ headers=MEDICAL_SUMMARIZATION_COLS,
653
+ datatype=TYPES,
654
+ visible=False,
655
+ )
656
+
657
+
658
+ search_bar.submit(
659
+ update_table,
660
+ [
661
+ hidden_leaderboard_table_for_search,
662
+ shown_columns,
663
+ search_bar,
664
+ filter_columns_type,
665
+ filter_domain_specific,
666
+ filter_columns_size
667
+ # filter_columns_architecture
668
+ ],
669
+ leaderboard_table,
670
+ )
671
+ for selector in [
672
+ shown_columns,
673
+ filter_columns_type,
674
+ filter_domain_specific,
675
+ filter_columns_size,
676
+ # deleted_models_visibility,
677
+ ]:
678
+ selector.change(
679
+ update_table,
680
+ [
681
+ hidden_leaderboard_table_for_search,
682
+ shown_columns,
683
+ search_bar,
684
+ filter_columns_type,
685
+ filter_domain_specific,
686
+ filter_columns_size
687
+ ],
688
+ leaderboard_table,
689
+ queue=True,
690
+ )
691
+ with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=4):
692
+ with gr.Tabs(elem_classes="tab-buttons2") as tabs:
693
+ with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-table2", id=0):
694
+ with gr.Row():
695
+ with gr.Column():
696
+ with gr.Row():
697
+ search_bar = gr.Textbox(
698
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
699
+ show_label=False,
700
+ elem_id="search-bar",
701
+ )
702
+ with gr.Row():
703
+ shown_columns = gr.CheckboxGroup(
704
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
705
+ value=[
706
+ c.name
707
+ for c in fields(AutoEvalColumn)
708
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
709
+ ],
710
+ label="Select columns to show",
711
+ elem_id="column-select",
712
+ interactive=True,
713
+ )
714
+ # with gr.Row():
715
+ # deleted_models_visibility = gr.Checkbox(
716
+ # value=False, label="Show gated/private/deleted models", interactive=True
717
+ # )
718
+ with gr.Column(min_width=320):
719
+ # with gr.Box(elem_id="box-filter"):
720
+ filter_columns_type = gr.CheckboxGroup(
721
+ label="Model Types",
722
+ choices=[t.to_str() for t in ModelType],
723
+ value=[t.to_str() for t in ModelType],
724
+ interactive=True,
725
+ elem_id="filter-columns-type",
726
+ )
727
+ # filter_columns_architecture = gr.CheckboxGroup(
728
+ # label="Architecture Types",
729
+ # choices=[i.value.name for i in ModelArch],
730
+ # value=[i.value.name for i in ModelArch],
731
+ # interactive=True,
732
+ # elem_id="filter-columns-architecture",
733
+ # )
734
+ filter_domain_specific = gr.CheckboxGroup(
735
+ label="Domain specific models",
736
+ choices=["Yes", "No"],
737
+ value=["Yes", "No"],
738
+ interactive=True,
739
+ elem_id="filter-columns-type",
740
+ )
741
+ filter_columns_size = gr.CheckboxGroup(
742
+ label="Model sizes (in billions of parameters)",
743
+ choices=list(NUMERIC_INTERVALS.keys()),
744
+ value=list(NUMERIC_INTERVALS.keys()),
745
+ interactive=True,
746
+ elem_id="filter-columns-size",
747
+ )
748
+
749
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="aci")
750
+
751
+ leaderboard_table = gr.components.Dataframe(
752
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
753
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
754
+ datatype=TYPES,
755
+ elem_id="leaderboard-table",
756
+ interactive=False,
757
+ visible=True,
758
+ )
759
+
760
+ # Dummy leaderboard for handling the case when the user uses backspace key
761
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
762
+ value=datasets_original_df[ACI_COLS],
763
+ headers=ACI_COLS,
764
+ datatype=TYPES,
765
+ visible=False,
766
+ )
767
+
768
+
769
+ search_bar.submit(
770
+ update_table,
771
+ [
772
+ hidden_leaderboard_table_for_search,
773
+ shown_columns,
774
+ search_bar,
775
+ filter_columns_type,
776
+ filter_domain_specific,
777
+ filter_columns_size
778
+ # filter_columns_architecture
779
+ ],
780
+ leaderboard_table,
781
+ )
782
+ for selector in [
783
+ shown_columns,
784
+ filter_columns_type,
785
+ filter_domain_specific,
786
+ filter_columns_size,
787
+ # deleted_models_visibility,
788
+ ]:
789
+ selector.change(
790
+ update_table,
791
+ [
792
+ hidden_leaderboard_table_for_search,
793
+ shown_columns,
794
+ search_bar,
795
+ filter_columns_type,
796
+ filter_domain_specific,
797
+ filter_columns_size
798
+ ],
799
+ leaderboard_table,
800
+ queue=True,
801
+ )
802
+
803
+ with gr.TabItem("SOAP Notes", elem_id="llm-benchmark-tab-table2", id=1):
804
+ with gr.Row():
805
+ with gr.Column():
806
+ with gr.Row():
807
+ search_bar = gr.Textbox(
808
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
809
+ show_label=False,
810
+ elem_id="search-bar",
811
+ )
812
+ with gr.Row():
813
+ shown_columns = gr.CheckboxGroup(
814
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
815
+ value=[
816
+ c.name
817
+ for c in fields(AutoEvalColumn)
818
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
819
+ ],
820
+ label="Select columns to show",
821
+ elem_id="column-select",
822
+ interactive=True,
823
+ )
824
+ # with gr.Row():
825
+ # deleted_models_visibility = gr.Checkbox(
826
+ # value=False, label="Show gated/private/deleted models", interactive=True
827
+ # )
828
+ with gr.Column(min_width=320):
829
+ # with gr.Box(elem_id="box-filter"):
830
+ filter_columns_type = gr.CheckboxGroup(
831
+ label="Model Types",
832
+ choices=[t.to_str() for t in ModelType],
833
+ value=[t.to_str() for t in ModelType],
834
+ interactive=True,
835
+ elem_id="filter-columns-type",
836
+ )
837
+ # filter_columns_architecture = gr.CheckboxGroup(
838
+ # label="Architecture Types",
839
+ # choices=[i.value.name for i in ModelArch],
840
+ # value=[i.value.name for i in ModelArch],
841
+ # interactive=True,
842
+ # elem_id="filter-columns-architecture",
843
+ # )
844
+ filter_domain_specific = gr.CheckboxGroup(
845
+ label="Domain specific models",
846
+ choices=["Yes", "No"],
847
+ value=["Yes", "No"],
848
+ interactive=True,
849
+ elem_id="filter-columns-type",
850
+ )
851
+ filter_columns_size = gr.CheckboxGroup(
852
+ label="Model sizes (in billions of parameters)",
853
+ choices=list(NUMERIC_INTERVALS.keys()),
854
+ value=list(NUMERIC_INTERVALS.keys()),
855
+ interactive=True,
856
+ elem_id="filter-columns-size",
857
+ )
858
+
859
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="soap")
860
+
861
+ leaderboard_table = gr.components.Dataframe(
862
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
863
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
864
+ datatype=TYPES,
865
+ elem_id="leaderboard-table",
866
+ interactive=False,
867
+ visible=True,
868
+ )
869
+
870
+ # Dummy leaderboard for handling the case when the user uses backspace key
871
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
872
+ value=datasets_original_df[SOAP_COLS],
873
+ headers=SOAP_COLS,
874
+ datatype=TYPES,
875
+ visible=False,
876
+ )
877
+
878
+
879
+ search_bar.submit(
880
+ update_table,
881
+ [
882
+ hidden_leaderboard_table_for_search,
883
+ shown_columns,
884
+ search_bar,
885
+ filter_columns_type,
886
+ filter_domain_specific,
887
+ filter_columns_size
888
+ # filter_columns_architecture
889
+ ],
890
+ leaderboard_table,
891
+ )
892
+ for selector in [
893
+ shown_columns,
894
+ filter_columns_type,
895
+ filter_domain_specific,
896
+ filter_columns_size,
897
+ # deleted_models_visibility,
898
+ ]:
899
+ selector.change(
900
+ update_table,
901
+ [
902
+ hidden_leaderboard_table_for_search,
903
+ shown_columns,
904
+ search_bar,
905
+ filter_columns_type,
906
+ filter_domain_specific,
907
+ filter_columns_size
908
+ ],
909
+ leaderboard_table,
910
+ queue=True,
911
+ )
912
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=5):
913
  gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
914
  # gr.HTML(EVALUATION_EXAMPLE_IMG, elem_classes="logo")
915
  # gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
916
  # gr.HTML(ENTITY_DISTRIBUTION_IMG, elem_classes="logo")
917
  # gr.Markdown(LLM_BENCHMARKS_TEXT_3, elem_classes="markdown-text")
918
 
919
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=6):
920
  with gr.Column():
921
  with gr.Row():
922
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
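
The three new leaderboard tabs added to app.py (Medical Summarization, ACI Bench, SOAP Notes) all repeat the same Gradio wiring: a search box, a column selector, filter checkboxes, a visible table seeded from update_df, a hidden full copy of the table for search, and update_table bound to search_bar.submit and every selector's change event. A minimal, self-contained sketch of that shared pattern is below; the toy dataframe and the filter_table stand-in are illustrative only and are not code from this commit.

```python
import gradio as gr
import pandas as pd

# Toy stand-in for the leaderboard dataframe returned by update_df(...).
toy_df = pd.DataFrame({
    "Model": ["org/model-a", "org/model-b"],
    "Coverage Score": [0.81, 0.77],
    "Fact Score": [0.64, 0.70],
})

def filter_table(full_df: pd.DataFrame, query: str) -> pd.DataFrame:
    # Stand-in for update_table: filter the hidden full table by the search query.
    if not query:
        return full_df
    return full_df[full_df["Model"].str.contains(query, case=False)]

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Medical Summarization"):
            search_bar = gr.Textbox(show_label=False, placeholder="🔍 Search for your model...")
            # Visible table, plus a hidden full copy so the original rows survive filtering.
            leaderboard_table = gr.Dataframe(value=toy_df, interactive=False)
            hidden_table = gr.Dataframe(value=toy_df, visible=False)
            search_bar.submit(filter_table, [hidden_table, search_bar], leaderboard_table)

if __name__ == "__main__":
    demo.launch()
```

The real tabs additionally pass the model-type, domain-specific, and size filters into update_table; the shape of the call is the same.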
src/about.py CHANGED
@@ -57,6 +57,42 @@ class MedSafetyColumns(Enum):
57
  med_safety_column7 = MedSafetyColumn("Responsibility to Patient", "score", "Responsibility to Patient")
58
  med_safety_column8 = MedSafetyColumn("Law and Responsibility to Society", "score", "Law and Responsibility to Society")
59
 
60
  NUM_FEWSHOT = 0 # Change with your few shot
61
  # ---------------------------------------------------
62
 
 
57
  med_safety_column7 = MedSafetyColumn("Responsibility to Patient", "score", "Responsibility to Patient")
58
  med_safety_column8 = MedSafetyColumn("Law and Responsibility to Society", "score", "Law and Responsibility to Society")
59
 
60
+ @dataclass
61
+ class MedicalSummarizationColumn:
62
+ benchmark: str
63
+ metric: str
64
+ col_name: str
65
+
66
+ class MedicalSummarizationColumns(Enum):
67
+ medical_summarization_column0 = MedicalSummarizationColumn("brief", "score", "Brief Score")
68
+ medical_summarization_column1 = MedicalSummarizationColumn("coverage", "score", "Coverage Score")
69
+ medical_summarization_column2 = MedicalSummarizationColumn("conform", "score", "Conform Score")
70
+ medical_summarization_column3 = MedicalSummarizationColumn("fact", "score", "Fact Score")
71
+
72
+ @dataclass
73
+ class ACIColumn:
74
+ benchmark: str
75
+ metric: str
76
+ col_name: str
77
+
78
+ class ACIColumns(Enum):
79
+ aci_column0 = ACIColumn("brief", "score", "Brief Score")
80
+ aci_column1 = ACIColumn("coverage", "score", "Coverage Score")
81
+ aci_column2 = ACIColumn("conform", "score", "Conform Score")
82
+ aci_column3 = ACIColumn("fact", "score", "Fact Score")
83
+
84
+ @dataclass
85
+ class SOAPColumn:
86
+ benchmark: str
87
+ metric: str
88
+ col_name: str
89
+
90
+ class SOAPColumns(Enum):
91
+ soap_column0 = SOAPColumn("brief", "score", "Brief Score")
92
+ soap_column1 = SOAPColumn("coverage", "score", "Coverage Score")
93
+ soap_column2 = SOAPColumn("conform", "score", "Conform Score")
94
+ soap_column3 = SOAPColumn("fact", "score", "Fact Score")
95
+
96
  NUM_FEWSHOT = 0 # Change with your few shot
97
  # ---------------------------------------------------
98
 
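The three new classes above (MedicalSummarizationColumns, ACIColumns, SOAPColumns) follow the same convention as the existing MedSafetyColumns: each enum member wraps a small dataclass, and downstream code only ever reads .value.col_name or .value.benchmark. A minimal sketch of that round trip, using the SOAP columns as the example (this mirrors how src/display/utils.py consumes the enum):

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class SOAPColumn:
    benchmark: str   # key expected inside the results JSON
    metric: str      # metric name, always "score" here
    col_name: str    # display name shown in the leaderboard table

class SOAPColumns(Enum):
    soap_column0 = SOAPColumn("brief", "score", "Brief Score")
    soap_column1 = SOAPColumn("coverage", "score", "Coverage Score")
    soap_column2 = SOAPColumn("conform", "score", "Conform Score")
    soap_column3 = SOAPColumn("fact", "score", "Fact Score")

# Same comprehension used for SOAP_BENCHMARK_COLS in src/display/utils.py:
SOAP_BENCHMARK_COLS = [t.value.col_name for t in SOAPColumns]
print(SOAP_BENCHMARK_COLS)  # ['Brief Score', 'Coverage Score', 'Conform Score', 'Fact Score']
```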
src/display/utils.py CHANGED
@@ -4,7 +4,7 @@ from enum import Enum
4
  import pandas as pd
5
 
6
  # changes to be made here
7
- from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns
8
 
9
 
10
  def fields(raw_class):
@@ -26,7 +26,9 @@ class ColumnContent:
26
  dataset_task_col: bool = False
27
  open_ended_col: bool = False
28
  med_safety_col: bool = False
29
- cross_examination_col: bool = False
30
 
31
 
32
  ## Leaderboard columns
@@ -44,6 +46,12 @@ for column in OpenEndedColumns:
44
  # changes to be made here
45
  for column in MedSafetyColumns:
46
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, med_safety_col=True, invariant=False)])
47
  auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
48
  auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
49
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
@@ -74,6 +82,8 @@ class EvalQueueColumn: # Queue column
74
  closed_ended_status = ColumnContent("closed_ended_status", "str", True)
75
  open_ended_status = ColumnContent("open_ended_status", "str", True)
76
  med_safety_status = ColumnContent("med_safety_status", "str", True)
77
 
78
  ## All the model information that we might need
79
  @dataclass
@@ -196,7 +206,10 @@ class EvaluationMetrics(Enum):
196
  DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.dataset_task_col or c.invariant)]
197
  OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_col or c.invariant)]
198
  MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.med_safety_col or c.invariant)]
199
- CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.cross_examination_col or c.invariant)]
200
  # DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
201
  # OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
202
  # MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.dataset_task_col and not c.cross_examination_col]
@@ -213,6 +226,9 @@ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
213
  DATASET_BENCHMARK_COLS = [t.value.col_name for t in HarnessTasks]
214
  OPEN_ENDED_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedColumns]
215
  MED_SAFETY_BENCHMARK_COLS = [t.value.col_name for t in MedSafetyColumns]
216
  # CROSS_EXAMINATION_BENCHMARK_COLS = [t.value.col_name for t in CrossExaminationTasks]
217
 
218
  NUMERIC_INTERVALS = {
 
4
  import pandas as pd
5
 
6
  # changes to be made here
7
+ from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
8
 
9
 
10
  def fields(raw_class):
 
26
  dataset_task_col: bool = False
27
  open_ended_col: bool = False
28
  med_safety_col: bool = False
29
+ medical_summarization_col: bool = False
30
+ aci_col: bool = False
31
+ soap_col: bool = False
32
 
33
 
34
  ## Leaderboard columns
 
46
  # changes to be made here
47
  for column in MedSafetyColumns:
48
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, med_safety_col=True, invariant=False)])
49
+ for column in MedicalSummarizationColumns:
50
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, medical_summarization_col=True, invariant=False)])
51
+ for column in ACIColumns:
52
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, aci_col=True, invariant=False)])
53
+ for column in SOAPColumns:
54
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, soap_col=True, invariant=False)])
55
  auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
56
  auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
57
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 
82
  closed_ended_status = ColumnContent("closed_ended_status", "str", True)
83
  open_ended_status = ColumnContent("open_ended_status", "str", True)
84
  med_safety_status = ColumnContent("med_safety_status", "str", True)
85
+ medical_summarization_status = ColumnContent("medical_summarization_status", "str", True)
86
+ note_generation_status = ColumnContent("note_generation_status", "str", True)
87
 
88
  ## All the model information that we might need
89
  @dataclass
 
206
  DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.dataset_task_col or c.invariant)]
207
  OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_col or c.invariant)]
208
  MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.med_safety_col or c.invariant)]
209
+ MEDICAL_SUMMARIZATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medical_summarization_col or c.invariant)]
210
+ ACI_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.aci_col or c.invariant)]
211
+ SOAP_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.soap_col or c.invariant)]
212
+ # CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.cross_examination_col or c.invariant)]
213
  # DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
214
  # OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
215
  # MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.dataset_task_col and not c.cross_examination_col]
 
226
  DATASET_BENCHMARK_COLS = [t.value.col_name for t in HarnessTasks]
227
  OPEN_ENDED_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedColumns]
228
  MED_SAFETY_BENCHMARK_COLS = [t.value.col_name for t in MedSafetyColumns]
229
+ MEDICAL_SUMMARIZATION_BENCHMARK_COLS = [t.value.col_name for t in MedicalSummarizationColumns]
230
+ ACI_BENCHMARK_COLS = [t.value.col_name for t in ACIColumns]
231
+ SOAP_BENCHMARK_COLS = [t.value.col_name for t in SOAPColumns]
232
  # CROSS_EXAMINATION_BENCHMARK_COLS = [t.value.col_name for t in CrossExaminationTasks]
233
 
234
  NUMERIC_INTERVALS = {
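
The new medical_summarization_col, aci_col, and soap_col flags drive the per-subset column lists the same way the existing flags do: a column is selected for a subset when it is either invariant (model name, precision, etc.) or tagged for that subset. A self-contained sketch of that selection logic, with a simplified ColumnContent standing in for the real dataclass above:

```python
from dataclasses import dataclass

@dataclass
class ColumnContent:
    # Simplified stand-in for the real ColumnContent in src/display/utils.py.
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    invariant: bool = True
    medical_summarization_col: bool = False

columns = [
    ColumnContent("Model", "markdown", True),
    ColumnContent("Precision", "str", False),
    ColumnContent("Coverage Score", "number", True, invariant=False, medical_summarization_col=True),
    ColumnContent("Fact Score", "number", True, invariant=False, medical_summarization_col=True),
]

# Same shape as the MEDICAL_SUMMARIZATION_COLS comprehension above:
MEDICAL_SUMMARIZATION_COLS = [
    c.name for c in columns
    if not c.hidden and (c.medical_summarization_col or c.invariant)
]
print(MEDICAL_SUMMARIZATION_COLS)  # ['Model', 'Precision', 'Coverage Score', 'Fact Score']
```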
src/leaderboard/read_evals.py CHANGED
@@ -9,7 +9,7 @@ import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
  # changes to be made here
12
- from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns
13
  from src.submission.check_validity import is_model_on_hub
14
 
15
 
@@ -26,7 +26,9 @@ class EvalResult:
26
  # changes to be made here
27
  open_ended_results: dict
28
  med_safety_results: dict
29
- cross_examination_results: dict
30
  is_domain_specific: bool
31
  use_chat_template: bool
32
  # clinical_type_results:dict
@@ -123,7 +125,42 @@ class EvalResult:
123
  continue
124
  mean_acc = np.mean(accs) # * 100.0
125
  med_safety_results[task.benchmark] = mean_acc
126
- cross_examination_results = {}
127
  # types_results = {}
128
  # for clinical_type in ClinicalTypes:
129
  # clinical_type = clinical_type.value
@@ -145,7 +182,9 @@ class EvalResult:
145
  dataset_results=harness_results,
146
  open_ended_results=open_ended_results,
147
  med_safety_results=med_safety_results,
148
- cross_examination_results=cross_examination_results,
149
  is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
150
  use_chat_template=config.get("use_chat_template", False), # Assuming a default value
151
  precision=precision,
@@ -224,12 +263,22 @@ class EvalResult:
224
  for task in MedSafetyColumns:
225
  data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
226
  return data_dict
227
 
228
- # if subset == "cross_examination":
229
- # if len(self.cross_examination_results) > 0:
230
- # for task in CrossExaminationTasks:
231
- # data_dict[task.value.col_name] = self.cross_examination_results[task.value.benchmark]
232
- # return data_dict
233
 
234
  def get_request_file_for_model(requests_path, model_name, precision):
235
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
 
9
 
10
  from src.display.formatting import make_clickable_model
11
  # changes to be made here
12
+ from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
13
  from src.submission.check_validity import is_model_on_hub
14
 
15
 
 
26
  # changes to be made here
27
  open_ended_results: dict
28
  med_safety_results: dict
29
+ medical_summarization_results: dict
30
+ aci_results: dict
31
+ soap_results: dict
32
  is_domain_specific: bool
33
  use_chat_template: bool
34
  # clinical_type_results:dict
 
125
  continue
126
  mean_acc = np.mean(accs) # * 100.0
127
  med_safety_results[task.benchmark] = mean_acc
128
+ medical_summarization_results = {}
129
+ if "medical-summarization" in data["results"]:
130
+ for task in MedicalSummarizationColumns:
131
+ task = task.value
132
+ try:
133
+ accs = np.array([v for k, v in data["results"]["medical-summarization"]["clinical_trial"].items() if task.benchmark == k])
134
+ except:
135
+ accs = np.array([])
136
+ if accs.size == 0 or any([acc is None for acc in accs]):
137
+ continue
138
+ mean_acc = np.mean(accs) # * 100.0
139
+ medical_summarization_results[task.benchmark] = mean_acc
140
+ aci_results = {}
141
+ if "note-generation" in data["results"] and "aci" in data["results"]["note-generation"]:
142
+ for task in ACIColumns:
143
+ task = task.value
144
+ try:
145
+ accs = np.array([v for k, v in data["results"]["note-generation"]["aci"].items() if task.benchmark == k])
146
+ except:
147
+ accs = np.array([])
148
+ if accs.size == 0 or any([acc is None for acc in accs]):
149
+ continue
150
+ mean_acc = np.mean(accs) # * 100.0
151
+ aci_results[task.benchmark] = mean_acc
152
+ soap_results = {}
153
+ if "note-generation" in data["results"] and "soap" in data["results"]["note-generation"]:
154
+ for task in SOAPColumns:
155
+ task = task.value
156
+ try:
157
+ accs = np.array([v for k, v in data["results"]["note-generation"]["soap"].items() if task.benchmark == k])
158
+ except:
159
+ accs = np.array([])
160
+ if accs.size == 0 or any([acc is None for acc in accs]):
161
+ continue
162
+ mean_acc = np.mean(accs) # * 100.0
163
+ soap_results[task.benchmark] = mean_acc
164
  # types_results = {}
165
  # for clinical_type in ClinicalTypes:
166
  # clinical_type = clinical_type.value
 
182
  dataset_results=harness_results,
183
  open_ended_results=open_ended_results,
184
  med_safety_results=med_safety_results,
185
+ medical_summarization_results=medical_summarization_results,
186
+ aci_results=aci_results,
187
+ soap_results=soap_results,
188
  is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
189
  use_chat_template=config.get("use_chat_template", False), # Assuming a default value
190
  precision=precision,
 
263
  for task in MedSafetyColumns:
264
  data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
265
  return data_dict
266
+ if subset == "medical_summarization":
267
+ if len(self.medical_summarization_results) > 0:
268
+ for task in MedicalSummarizationColumns:
269
+ data_dict[task.value.col_name] = self.medical_summarization_results[task.value.benchmark]
270
+ return data_dict
271
+ if subset == "aci":
272
+ if len(self.aci_results) > 0:
273
+ for task in ACIColumns:
274
+ data_dict[task.value.col_name] = self.aci_results[task.value.benchmark]
275
+ return data_dict
276
+ if subset == "soap":
277
+ if len(self.soap_results) > 0:
278
+ for task in SOAPColumns:
279
+ data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
280
+ return data_dict
281
 
282
 
283
  def get_request_file_for_model(requests_path, model_name, precision):
284
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
src/populate.py CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  # changes to be made here
8
- from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
 
@@ -24,6 +24,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
24
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=True)
25
  elif subset == "open_ended":
26
  df = df.sort_values(by=["ELO"], ascending=False)
27
  cols = list(set(df.columns).intersection(set(cols)))
28
  df = df[cols].round(decimals=2)
29
  # filter out if any of the benchmarks have not been produced
@@ -46,6 +52,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
46
  data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
47
  data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
48
  data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
49
  all_evals.append(data)
50
  elif ".md" not in entry:
51
  # this is a folder
@@ -60,6 +68,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
60
  data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
61
  data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
62
  data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
63
  all_evals.append(data)
64
  # breakpoint()
65
  pending_list = []
@@ -67,8 +77,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
67
  finished_list = []
68
  for run in all_evals:
69
  # changes to be made here
70
- status_list = [run["status"]["closed-ended"], run["status"]["open-ended"], run["status"]["med-safety"], run["status"]["cross-examination"]]
71
- status_list = status_list[:3]
72
  if "RUNNING" in status_list:
73
  running_list.append(run)
74
  elif "PENDING" in status_list or "RERUN" in status_list:
 
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  # changes to be made here
8
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
 
 
24
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=True)
25
  elif subset == "open_ended":
26
  df = df.sort_values(by=["ELO"], ascending=False)
27
+ elif subset == "medical_summarization":
28
+ df = df.sort_values(by=["Coverage Score"], ascending=False)
29
+ elif subset == "aci":
30
+ df = df.sort_values(by=["Coverage Score"], ascending=False)
31
+ elif subset == "soap":
32
+ df = df.sort_values(by=["Coverage Score"], ascending=False)
33
  cols = list(set(df.columns).intersection(set(cols)))
34
  df = df[cols].round(decimals=2)
35
  # filter out if any of the benchmarks have not been produced
 
52
  data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
53
  data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
54
  data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
55
+ data[EvalQueueColumn.medical_summarization_status.name] = data["status"]["medical-summarization"]
56
+ data[EvalQueueColumn.note_generation_status.name] = data["status"]["note-generation"]
57
  all_evals.append(data)
58
  elif ".md" not in entry:
59
  # this is a folder
 
68
  data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
69
  data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
70
  data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
71
+ data[EvalQueueColumn.medical_summarization_status.name] = data["status"]["medical-summarization"]
72
+ data[EvalQueueColumn.note_generation_status.name] = data["status"]["note-generation"]
73
  all_evals.append(data)
74
  # breakpoint()
75
  pending_list = []
 
77
  finished_list = []
78
  for run in all_evals:
79
  # changes to be made here
80
+ status_list = [run["status"]["closed-ended"], run["status"]["open-ended"], run["status"]["med-safety"], run["status"]["medical-summarization"], run["status"]["note-generation"]]
81
+ # status_list = status_list
82
  if "RUNNING" in status_list:
83
  running_list.append(run)
84
  elif "PENDING" in status_list or "RERUN" in status_list:
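
Because the status check in populate.py now indexes five keys directly, every request file in the queue is expected to carry "medical-summarization" and "note-generation" entries in its "status" dict (a file without them would raise a KeyError). A hedged sketch of a request payload that satisfies the new code, with the same bucketing logic; the model name is a hypothetical placeholder:

```python
# Hypothetical request-file payload; only the "status" keys are taken from the diff.
request = {
    "model": "org/model-name",
    "status": {
        "closed-ended": "FINISHED",
        "open-ended": "FINISHED",
        "med-safety": "FINISHED",
        "medical-summarization": "PENDING",
        "note-generation": "PENDING",
    },
}

status_list = [request["status"][k] for k in
               ["closed-ended", "open-ended", "med-safety", "medical-summarization", "note-generation"]]

# Same precedence as get_evaluation_queue_df: RUNNING beats PENDING/RERUN beats FINISHED.
if "RUNNING" in status_list:
    bucket = "running"
elif "PENDING" in status_list or "RERUN" in status_list:
    bucket = "pending"
else:
    bucket = "finished"
print(bucket)  # pending
```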