Spaces: Running
Commit 553b217 · 1 Parent(s): 6c10fa6
[ADD] Cross-examination framework

- app.py +357 -7
- src/about.py +36 -0
- src/display/utils.py +19 -3
- src/leaderboard/read_evals.py +58 -9
- src/populate.py +13 -3
app.py
CHANGED
@@ -24,9 +24,15 @@ from src.display.utils import (
     DATASET_BENCHMARK_COLS,
     OPEN_ENDED_BENCHMARK_COLS,
     MED_SAFETY_BENCHMARK_COLS,
+    MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
+    ACI_BENCHMARK_COLS,
+    SOAP_BENCHMARK_COLS,
     DATASET_COLS,
     OPEN_ENDED_COLS,
     MED_SAFETY_COLS,
+    MEDICAL_SUMMARIZATION_COLS,
+    ACI_COLS,
+    SOAP_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     NUMERIC_INTERVALS,

@@ -75,7 +81,15 @@ open_ended_leaderboard_df = open_ended_original_df.copy()
 _, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
 med_safety_leaderboard_df = med_safety_original_df.copy()
 
-
+_, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
+medical_summarization_leaderboard_df = medical_summarization_original_df.copy()
+
+_, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
+aci_leaderboard_df = aci_original_df.copy()
+
+_, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
+soap_leaderboard_df = soap_original_df.copy()
+
 # breakpoint()
 # # Token based results
 # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")

@@ -91,7 +105,7 @@ med_safety_leaderboard_df = med_safety_original_df.copy()
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-
+# breakpoint()
 def update_df(shown_columns, subset="datasets"):
     # changes to be made here
     if subset == "datasets":

@@ -103,6 +117,15 @@ def update_df(shown_columns, subset="datasets"):
     elif subset == "med_safety":
         leaderboard_table_df = med_safety_leaderboard_df.copy()
         hidden_leader_board_df = med_safety_original_df
+    elif subset == "medical_summarization":
+        leaderboard_table_df = medical_summarization_leaderboard_df.copy()
+        hidden_leader_board_df = medical_summarization_original_df
+    elif subset == "aci":
+        leaderboard_table_df = aci_leaderboard_df.copy()
+        hidden_leader_board_df = aci_original_df
+    elif subset == "soap":
+        leaderboard_table_df = soap_leaderboard_df.copy()
+        hidden_leader_board_df = soap_original_df
     # else:
     #     match evaluation_metric:
     #         case "Span Based":

@@ -217,6 +240,7 @@ def filter_models(
 
 demo = gr.Blocks(css=custom_css)
 with demo:
+    print("hello")
     gr.HTML(TITLE)
     gr.HTML(LOGO, elem_classes="logo")
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

@@ -555,18 +579,344 @@ with demo:
                     queue=True,
                 )
 
-        with gr.TabItem("🏅
-            gr.
-
-
-
+        with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
+                            value=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    # with gr.Row():
+                    #     deleted_models_visibility = gr.Checkbox(
+                    #         value=False, label="Show gated/private/deleted models", interactive=True
+                    #     )
+                with gr.Column(min_width=320):
+                    # with gr.Box(elem_id="box-filter"):
+                    filter_columns_type = gr.CheckboxGroup(
+                        label="Model Types",
+                        choices=[t.to_str() for t in ModelType],
+                        value=[t.to_str() for t in ModelType],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    # filter_columns_architecture = gr.CheckboxGroup(
+                    #     label="Architecture Types",
+                    #     choices=[i.value.name for i in ModelArch],
+                    #     value=[i.value.name for i in ModelArch],
+                    #     interactive=True,
+                    #     elem_id="filter-columns-architecture",
+                    # )
+                    filter_domain_specific = gr.CheckboxGroup(
+                        label="Domain specific models",
+                        choices=["Yes", "No"],
+                        value=["Yes", "No"],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    filter_columns_size = gr.CheckboxGroup(
+                        label="Model sizes (in billions of parameters)",
+                        choices=list(NUMERIC_INTERVALS.keys()),
+                        value=list(NUMERIC_INTERVALS.keys()),
+                        interactive=True,
+                        elem_id="filter-columns-size",
+                    )
+
+            datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="medical_summarization")
+
+            leaderboard_table = gr.components.Dataframe(
+                value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=datasets_original_df[MEDICAL_SUMMARIZATION_COLS],
+                headers=MEDICAL_SUMMARIZATION_COLS,
+                datatype=TYPES,
+                visible=False,
+            )
+
+
+            search_bar.submit(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    search_bar,
+                    filter_columns_type,
+                    filter_domain_specific,
+                    filter_columns_size
+                    # filter_columns_architecture
+                ],
+                leaderboard_table,
+            )
+            for selector in [
+                shown_columns,
+                filter_columns_type,
+                filter_domain_specific,
+                filter_columns_size,
+                # deleted_models_visibility,
+            ]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        search_bar,
+                        filter_columns_type,
+                        filter_domain_specific,
+                        filter_columns_size
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
+        with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=4):
+            with gr.Tabs(elem_classes="tab-buttons2") as tabs:
+                with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-table2", id=0):
+                    with gr.Row():
+                        with gr.Column():
+                            with gr.Row():
+                                search_bar = gr.Textbox(
+                                    placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                                    show_label=False,
+                                    elem_id="search-bar",
+                                )
+                            with gr.Row():
+                                shown_columns = gr.CheckboxGroup(
+                                    choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
+                                    value=[
+                                        c.name
+                                        for c in fields(AutoEvalColumn)
+                                        if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
+                                    ],
+                                    label="Select columns to show",
+                                    elem_id="column-select",
+                                    interactive=True,
+                                )
+                            # with gr.Row():
+                            #     deleted_models_visibility = gr.Checkbox(
+                            #         value=False, label="Show gated/private/deleted models", interactive=True
+                            #     )
+                        with gr.Column(min_width=320):
+                            # with gr.Box(elem_id="box-filter"):
+                            filter_columns_type = gr.CheckboxGroup(
+                                label="Model Types",
+                                choices=[t.to_str() for t in ModelType],
+                                value=[t.to_str() for t in ModelType],
+                                interactive=True,
+                                elem_id="filter-columns-type",
+                            )
+                            # filter_columns_architecture = gr.CheckboxGroup(
+                            #     label="Architecture Types",
+                            #     choices=[i.value.name for i in ModelArch],
+                            #     value=[i.value.name for i in ModelArch],
+                            #     interactive=True,
+                            #     elem_id="filter-columns-architecture",
+                            # )
+                            filter_domain_specific = gr.CheckboxGroup(
+                                label="Domain specific models",
+                                choices=["Yes", "No"],
+                                value=["Yes", "No"],
+                                interactive=True,
+                                elem_id="filter-columns-type",
+                            )
+                            filter_columns_size = gr.CheckboxGroup(
+                                label="Model sizes (in billions of parameters)",
+                                choices=list(NUMERIC_INTERVALS.keys()),
+                                value=list(NUMERIC_INTERVALS.keys()),
+                                interactive=True,
+                                elem_id="filter-columns-size",
+                            )
+
+                    datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="aci")
+
+                    leaderboard_table = gr.components.Dataframe(
+                        value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+                        headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                        datatype=TYPES,
+                        elem_id="leaderboard-table",
+                        interactive=False,
+                        visible=True,
+                    )
+
+                    # Dummy leaderboard for handling the case when the user uses backspace key
+                    hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                        value=datasets_original_df[ACI_COLS],
+                        headers=ACI_COLS,
+                        datatype=TYPES,
+                        visible=False,
+                    )
+
+
+                    search_bar.submit(
+                        update_table,
+                        [
+                            hidden_leaderboard_table_for_search,
+                            shown_columns,
+                            search_bar,
+                            filter_columns_type,
+                            filter_domain_specific,
+                            filter_columns_size
+                            # filter_columns_architecture
+                        ],
+                        leaderboard_table,
+                    )
+                    for selector in [
+                        shown_columns,
+                        filter_columns_type,
+                        filter_domain_specific,
+                        filter_columns_size,
+                        # deleted_models_visibility,
+                    ]:
+                        selector.change(
+                            update_table,
+                            [
+                                hidden_leaderboard_table_for_search,
+                                shown_columns,
+                                search_bar,
+                                filter_columns_type,
+                                filter_domain_specific,
+                                filter_columns_size
+                            ],
+                            leaderboard_table,
+                            queue=True,
+                        )
+
+                with gr.TabItem("SOAP Notes", elem_id="llm-benchmark-tab-table2", id=1):
+                    with gr.Row():
+                        with gr.Column():
+                            with gr.Row():
+                                search_bar = gr.Textbox(
+                                    placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                                    show_label=False,
+                                    elem_id="search-bar",
+                                )
+                            with gr.Row():
+                                shown_columns = gr.CheckboxGroup(
+                                    choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
+                                    value=[
+                                        c.name
+                                        for c in fields(AutoEvalColumn)
+                                        if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
+                                    ],
+                                    label="Select columns to show",
+                                    elem_id="column-select",
+                                    interactive=True,
+                                )
+                            # with gr.Row():
+                            #     deleted_models_visibility = gr.Checkbox(
+                            #         value=False, label="Show gated/private/deleted models", interactive=True
+                            #     )
+                        with gr.Column(min_width=320):
+                            # with gr.Box(elem_id="box-filter"):
+                            filter_columns_type = gr.CheckboxGroup(
+                                label="Model Types",
+                                choices=[t.to_str() for t in ModelType],
+                                value=[t.to_str() for t in ModelType],
+                                interactive=True,
+                                elem_id="filter-columns-type",
+                            )
+                            # filter_columns_architecture = gr.CheckboxGroup(
+                            #     label="Architecture Types",
+                            #     choices=[i.value.name for i in ModelArch],
+                            #     value=[i.value.name for i in ModelArch],
+                            #     interactive=True,
+                            #     elem_id="filter-columns-architecture",
+                            # )
+                            filter_domain_specific = gr.CheckboxGroup(
+                                label="Domain specific models",
+                                choices=["Yes", "No"],
+                                value=["Yes", "No"],
+                                interactive=True,
+                                elem_id="filter-columns-type",
+                            )
+                            filter_columns_size = gr.CheckboxGroup(
+                                label="Model sizes (in billions of parameters)",
+                                choices=list(NUMERIC_INTERVALS.keys()),
+                                value=list(NUMERIC_INTERVALS.keys()),
+                                interactive=True,
+                                elem_id="filter-columns-size",
+                            )
+
+                    datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="soap")
+
+                    leaderboard_table = gr.components.Dataframe(
+                        value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+                        headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                        datatype=TYPES,
+                        elem_id="leaderboard-table",
+                        interactive=False,
+                        visible=True,
+                    )
+
+                    # Dummy leaderboard for handling the case when the user uses backspace key
+                    hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                        value=datasets_original_df[SOAP_COLS],
+                        headers=SOAP_COLS,
+                        datatype=TYPES,
+                        visible=False,
+                    )
+
+
+                    search_bar.submit(
+                        update_table,
+                        [
+                            hidden_leaderboard_table_for_search,
+                            shown_columns,
+                            search_bar,
+                            filter_columns_type,
+                            filter_domain_specific,
+                            filter_columns_size
+                            # filter_columns_architecture
+                        ],
+                        leaderboard_table,
+                    )
+                    for selector in [
+                        shown_columns,
+                        filter_columns_type,
+                        filter_domain_specific,
+                        filter_columns_size,
+                        # deleted_models_visibility,
+                    ]:
+                        selector.change(
+                            update_table,
+                            [
+                                hidden_leaderboard_table_for_search,
+                                shown_columns,
+                                search_bar,
+                                filter_columns_type,
+                                filter_domain_specific,
+                                filter_columns_size
+                            ],
+                            leaderboard_table,
+                            queue=True,
+                        )
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=5):
             gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
             # gr.HTML(EVALUATION_EXAMPLE_IMG, elem_classes="logo")
             # gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
             # gr.HTML(ENTITY_DISTRIBUTION_IMG, elem_classes="logo")
             # gr.Markdown(LLM_BENCHMARKS_TEXT_3, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=6):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
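The three new tab bodies are near-identical: they differ only in the tab title and id, the `subset` string passed to `update_df`, and the column list (MEDICAL_SUMMARIZATION_COLS, ACI_COLS, SOAP_COLS). A minimal sketch of a helper that could generate one such tab; the function name `render_leaderboard_tab` is hypothetical and not part of this commit:

# Hypothetical refactor sketch (not in this commit): one function building a
# leaderboard tab, parameterized by the three things that actually vary.
import gradio as gr

def render_leaderboard_tab(title, tab_id, subset, subset_cols, elem_id="llm-benchmark-tab-table"):
    with gr.TabItem(title, elem_id=elem_id, id=tab_id):
        search_bar = gr.Textbox(
            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
            show_label=False,
            elem_id="search-bar",
        )
        # The filter widgets and the update_df(shown_columns.value, subset=subset)
        # call would be built here exactly as in the repeated blocks above.
        leaderboard_table = gr.Dataframe(
            headers=subset_cols,
            interactive=False,
            elem_id="leaderboard-table",
        )
    return search_bar, leaderboard_table

# Usage inside `with demo:` / `with gr.Tabs():` would then be one line per subset, e.g.
# render_leaderboard_tab("🏅 Medical Summarization", 3, "medical_summarization", MEDICAL_SUMMARIZATION_COLS)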
src/about.py
CHANGED
@@ -57,6 +57,42 @@ class MedSafetyColumns(Enum):
     med_safety_column7 = MedSafetyColumn("Responsibility to Patient", "score", "Responsibility to Patient")
     med_safety_column8 = MedSafetyColumn("Law and Responsibility to Society", "score", "Law and Responsibility to Society")
 
+@dataclass
+class MedicalSummarizationColumn:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class MedicalSummarizationColumns(Enum):
+    medical_summarization_column0 = MedicalSummarizationColumn("brief", "score", "Brief Score")
+    medical_summarization_column1 = MedicalSummarizationColumn("coverage", "score", "Coverage Score")
+    medical_summarization_column2 = MedicalSummarizationColumn("conform", "score", "Conform Score")
+    medical_summarization_column3 = MedicalSummarizationColumn("fact", "score", "Fact Score")
+
+@dataclass
+class ACIColumn:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class ACIColumns(Enum):
+    aci_column0 = ACIColumn("brief", "score", "Brief Score")
+    aci_column1 = ACIColumn("coverage", "score", "Coverage Score")
+    aci_column2 = ACIColumn("conform", "score", "Conform Score")
+    aci_column3 = ACIColumn("fact", "score", "Fact Score")
+
+@dataclass
+class SOAPColumn:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class SOAPColumns(Enum):
+    soap_column0 = SOAPColumn("brief", "score", "Brief Score")
+    soap_column1 = SOAPColumn("coverage", "score", "Coverage Score")
+    soap_column2 = SOAPColumn("conform", "score", "Conform Score")
+    soap_column3 = SOAPColumn("fact", "score", "Fact Score")
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
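All three new column sets share the same four generation-quality axes (brief, coverage, conform, fact) and follow the existing (benchmark, metric, col_name) pattern: downstream code reads `.value.benchmark` as the key in the results file and `.value.col_name` as the leaderboard header. A small usage illustration with the enums defined above:

# Usage illustration of the enums added in this file: benchmark is the
# results-file key, col_name is the displayed leaderboard header.
from src.about import MedicalSummarizationColumns

for column in MedicalSummarizationColumns:
    task = column.value
    print(f"{task.benchmark!r} -> {task.col_name!r} (metric: {task.metric})")
# 'brief' -> 'Brief Score' (metric: score)
# 'coverage' -> 'Coverage Score' (metric: score)
# 'conform' -> 'Conform Score' (metric: score)
# 'fact' -> 'Fact Score' (metric: score)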
src/display/utils.py
CHANGED
@@ -4,7 +4,7 @@ from enum import Enum
 import pandas as pd
 
 # changes to be made here
-from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns
+from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
 
 
 def fields(raw_class):

@@ -26,7 +26,9 @@ class ColumnContent:
     dataset_task_col: bool = False
     open_ended_col: bool = False
     med_safety_col: bool = False
-
+    medical_summarization_col: bool = False
+    aci_col: bool = False
+    soap_col: bool = False
 
 
 ## Leaderboard columns

@@ -44,6 +46,12 @@ for column in OpenEndedColumns:
 # changes to be made here
 for column in MedSafetyColumns:
     auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, med_safety_col=True, invariant=False)])
+for column in MedicalSummarizationColumns:
+    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, medical_summarization_col=True, invariant=False)])
+for column in ACIColumns:
+    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, aci_col=True, invariant=False)])
+for column in SOAPColumns:
+    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, soap_col=True, invariant=False)])
 auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
 auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])

@@ -74,6 +82,8 @@ class EvalQueueColumn: # Queue column
     closed_ended_status = ColumnContent("closed_ended_status", "str", True)
     open_ended_status = ColumnContent("open_ended_status", "str", True)
     med_safety_status = ColumnContent("med_safety_status", "str", True)
+    medical_summarization_status = ColumnContent("medical_summarization_status", "str", True)
+    note_generation_status = ColumnContent("note_generation_status", "str", True)
 
 ## All the model information that we might need
 @dataclass

@@ -196,7 +206,10 @@ class EvaluationMetrics(Enum):
 DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.dataset_task_col or c.invariant)]
 OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_col or c.invariant)]
 MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.med_safety_col or c.invariant)]
-
+MEDICAL_SUMMARIZATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medical_summarization_col or c.invariant)]
+ACI_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.aci_col or c.invariant)]
+SOAP_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.soap_col or c.invariant)]
+# CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.cross_examination_col or c.invariant)]
 # DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
 # OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
 # MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.dataset_task_col and not c.cross_examination_col]

@@ -213,6 +226,9 @@ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 DATASET_BENCHMARK_COLS = [t.value.col_name for t in HarnessTasks]
 OPEN_ENDED_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedColumns]
 MED_SAFETY_BENCHMARK_COLS = [t.value.col_name for t in MedSafetyColumns]
+MEDICAL_SUMMARIZATION_BENCHMARK_COLS = [t.value.col_name for t in MedicalSummarizationColumns]
+ACI_BENCHMARK_COLS = [t.value.col_name for t in ACIColumns]
+SOAP_BENCHMARK_COLS = [t.value.col_name for t in SOAPColumns]
 # CROSS_EXAMINATION_BENCHMARK_COLS = [t.value.col_name for t in CrossExaminationTasks]
 
 NUMERIC_INTERVALS = {
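One detail worth noting: assuming the leaderboard template's field order ColumnContent(name, type, displayed_by_default, hidden, ...), the new columns are registered with displayed_by_default=True (third positional argument), whereas the med-safety columns pass False. A column then lands in a per-tab list when it is either shared across tabs ("invariant") or carries that tab's flag, as in this sketch mirroring the comprehensions above:

# Assuming ColumnContent(name, type, displayed_by_default, hidden, ...) field
# order from the standard leaderboard template: the summarization and
# note-generation columns above are shown by default, med-safety's are not.
from src.display.utils import AutoEvalColumn, fields

medical_summarization_cols = [
    c.name
    for c in fields(AutoEvalColumn)
    if not c.hidden and (c.medical_summarization_col or c.invariant)
]
print(medical_summarization_cols)  # shared model columns plus the four score columns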
src/leaderboard/read_evals.py
CHANGED
@@ -9,7 +9,7 @@ import numpy as np
 
 from src.display.formatting import make_clickable_model
 # changes to be made here
-from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns
+from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
 from src.submission.check_validity import is_model_on_hub
 
 

@@ -26,7 +26,9 @@ class EvalResult:
     # changes to be made here
     open_ended_results: dict
     med_safety_results: dict
-
+    medical_summarization_results: dict
+    aci_results: dict
+    soap_results: dict
     is_domain_specific: bool
     use_chat_template: bool
     # clinical_type_results:dict

@@ -123,7 +125,42 @@ class EvalResult:
                 continue
             mean_acc = np.mean(accs) # * 100.0
             med_safety_results[task.benchmark] = mean_acc
-
+        medical_summarization_results = {}
+        if "medical-summarization" in data["results"]:
+            for task in MedicalSummarizationColumns:
+                task = task.value
+                try:
+                    accs = np.array([v for k, v in data["results"]["medical-summarization"]["clinical_trial"].items() if task.benchmark == k])
+                except:
+                    accs = np.array([])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+                mean_acc = np.mean(accs) # * 100.0
+                medical_summarization_results[task.benchmark] = mean_acc
+        aci_results = {}
+        if "note-generation" in data["results"] and "aci" in data["results"]["note-generation"]:
+            for task in ACIColumns:
+                task = task.value
+                try:
+                    accs = np.array([v for k, v in data["results"]["note-generation"]["aci"].items() if task.benchmark == k])
+                except:
+                    accs = np.array([])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+                mean_acc = np.mean(accs) # * 100.0
+                aci_results[task.benchmark] = mean_acc
+        soap_results = {}
+        if "note-generation" in data["results"] and "soap" in data["results"]["note-generation"]:
+            for task in SOAPColumns:
+                task = task.value
+                try:
+                    accs = np.array([v for k, v in data["results"]["note-generation"]["soap"].items() if task.benchmark == k])
+                except:
+                    accs = np.array([])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+                mean_acc = np.mean(accs) # * 100.0
+                soap_results[task.benchmark] = mean_acc
         # types_results = {}
         # for clinical_type in ClinicalTypes:
         #     clinical_type = clinical_type.value

@@ -145,7 +182,9 @@ class EvalResult:
             dataset_results=harness_results,
             open_ended_results=open_ended_results,
             med_safety_results=med_safety_results,
-
+            medical_summarization_results=medical_summarization_results,
+            aci_results=aci_results,
+            soap_results=soap_results,
             is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
             use_chat_template=config.get("use_chat_template", False), # Assuming a default value
             precision=precision,

@@ -224,12 +263,22 @@ class EvalResult:
             for task in MedSafetyColumns:
                 data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
             return data_dict
+        if subset == "medical_summarization":
+            if len(self.medical_summarization_results) > 0:
+                for task in MedicalSummarizationColumns:
+                    data_dict[task.value.col_name] = self.medical_summarization_results[task.value.benchmark]
+                return data_dict
+        if subset == "aci":
+            if len(self.aci_results) > 0:
+                for task in ACIColumns:
+                    data_dict[task.value.col_name] = self.aci_results[task.value.benchmark]
+                return data_dict
+        if subset == "soap":
+            if len(self.soap_results) > 0:
+                for task in SOAPColumns:
+                    data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
+                return data_dict
 
-        # if subset == "cross_examination":
-        #     if len(self.cross_examination_results) > 0:
-        #         for task in CrossExaminationTasks:
-        #             data_dict[task.value.col_name] = self.cross_examination_results[task.value.benchmark]
-        #         return data_dict
 
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
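The new parsing branches imply the following shape for a model's results JSON, reconstructed from the dictionary lookups above; the 0.5 scores are placeholders, not real data:

# Results-file layout implied by the parsing in this diff (keys taken from
# the lookups above; score values are made up for illustration).
import numpy as np

data = {
    "results": {
        "medical-summarization": {
            "clinical_trial": {"brief": 0.5, "coverage": 0.5, "conform": 0.5, "fact": 0.5},
        },
        "note-generation": {
            "aci": {"brief": 0.5, "coverage": 0.5, "conform": 0.5, "fact": 0.5},
            "soap": {"brief": 0.5, "coverage": 0.5, "conform": 0.5, "fact": 0.5},
        },
    }
}

# Each branch filters the per-axis dict down to a single benchmark key, so
# np.mean over the resulting one-element array is just that score:
scores = data["results"]["note-generation"]["aci"]
accs = np.array([v for k, v in scores.items() if k == "brief"])
print(np.mean(accs))  # 0.5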
src/populate.py
CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 # changes to be made here
-from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
 from src.leaderboard.read_evals import get_raw_eval_results
 
 

@@ -24,6 +24,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, ...)
         df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=True)
     elif subset == "open_ended":
         df = df.sort_values(by=["ELO"], ascending=False)
+    elif subset == "medical_summarization":
+        df = df.sort_values(by=["Coverage Score"], ascending=False)
+    elif subset == "aci":
+        df = df.sort_values(by=["Coverage Score"], ascending=False)
+    elif subset == "soap":
+        df = df.sort_values(by=["Coverage Score"], ascending=False)
     cols = list(set(df.columns).intersection(set(cols)))
     df = df[cols].round(decimals=2)
     # filter out if any of the benchmarks have not been produced

@@ -46,6 +52,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
             data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
             data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
+            data[EvalQueueColumn.medical_summarization_status.name] = data["status"]["medical-summarization"]
+            data[EvalQueueColumn.note_generation_status.name] = data["status"]["note-generation"]
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder

@@ -60,6 +68,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
                 data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
                 data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
+                data[EvalQueueColumn.medical_summarization_status.name] = data["status"]["medical-summarization"]
+                data[EvalQueueColumn.note_generation_status.name] = data["status"]["note-generation"]
                 all_evals.append(data)
     # breakpoint()
     pending_list = []

@@ -67,8 +77,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     finished_list = []
     for run in all_evals:
         # changes to be made here
-        status_list = [run["status"]["closed-ended"], run["status"]["open-ended"], run["status"]["med-safety"], run["status"]["
-        status_list = status_list
+        status_list = [run["status"]["closed-ended"], run["status"]["open-ended"], run["status"]["med-safety"], run["status"]["medical-summarization"], run["status"]["note-generation"]]
+        # status_list = status_list
         if "RUNNING" in status_list:
             running_list.append(run)
         elif "PENDING" in status_list or "RERUN" in status_list:
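The widened status_list means a run's queue bucket is now decided across all five evaluation tracks: any RUNNING track puts the run in the running queue, otherwise any PENDING or RERUN track puts it in the pending queue. A toy illustration with a made-up run record; the "finished" fallback reflects the surrounding code's else branch, which lies outside this hunk:

# Toy illustration of the queue bucketing above, using a made-up run record.
run = {
    "status": {
        "closed-ended": "FINISHED",
        "open-ended": "FINISHED",
        "med-safety": "FINISHED",
        "medical-summarization": "RUNNING",
        "note-generation": "PENDING",
    }
}
status_list = [run["status"]["closed-ended"], run["status"]["open-ended"],
               run["status"]["med-safety"], run["status"]["medical-summarization"],
               run["status"]["note-generation"]]
if "RUNNING" in status_list:
    bucket = "running"
elif "PENDING" in status_list or "RERUN" in status_list:
    bucket = "pending"
else:
    bucket = "finished"
print(bucket)  # "running": a single RUNNING track dominates the bucket choice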