Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -250,117 +250,6 @@ with demo:
|
|
250 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
251 |
|
252 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
253 |
-
with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=0):
|
254 |
-
with gr.Row():
|
255 |
-
with gr.Column():
|
256 |
-
with gr.Row():
|
257 |
-
search_bar = gr.Textbox(
|
258 |
-
placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
|
259 |
-
show_label=False,
|
260 |
-
elem_id="search-bar",
|
261 |
-
)
|
262 |
-
with gr.Row():
|
263 |
-
shown_columns = gr.CheckboxGroup(
|
264 |
-
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
|
265 |
-
value=[
|
266 |
-
c.name
|
267 |
-
for c in fields(AutoEvalColumn)
|
268 |
-
if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
|
269 |
-
],
|
270 |
-
label="Select columns to show",
|
271 |
-
elem_id="column-select",
|
272 |
-
interactive=True,
|
273 |
-
)
|
274 |
-
# with gr.Row():
|
275 |
-
# deleted_models_visibility = gr.Checkbox(
|
276 |
-
# value=False, label="Show gated/private/deleted models", interactive=True
|
277 |
-
# )
|
278 |
-
with gr.Column(min_width=320):
|
279 |
-
# with gr.Box(elem_id="box-filter"):
|
280 |
-
filter_columns_type = gr.CheckboxGroup(
|
281 |
-
label="Model Types",
|
282 |
-
choices=[t.to_str() for t in ModelType],
|
283 |
-
value=[t.to_str() for t in ModelType],
|
284 |
-
interactive=True,
|
285 |
-
elem_id="filter-columns-type",
|
286 |
-
)
|
287 |
-
# filter_columns_architecture = gr.CheckboxGroup(
|
288 |
-
# label="Architecture Types",
|
289 |
-
# choices=[i.value.name for i in ModelArch],
|
290 |
-
# value=[i.value.name for i in ModelArch],
|
291 |
-
# interactive=True,
|
292 |
-
# elem_id="filter-columns-architecture",
|
293 |
-
# )
|
294 |
-
filter_domain_specific = gr.CheckboxGroup(
|
295 |
-
label="Domain Specificity",
|
296 |
-
choices=["🏥 Clinical models", "Generic models"],
|
297 |
-
value=["🏥 Clinical models", "Generic models"],
|
298 |
-
interactive=True,
|
299 |
-
elem_id="filter-columns-type",
|
300 |
-
)
|
301 |
-
filter_columns_size = gr.CheckboxGroup(
|
302 |
-
label="Model sizes (in billions of parameters)",
|
303 |
-
choices=list(NUMERIC_INTERVALS.keys()),
|
304 |
-
value=list(NUMERIC_INTERVALS.keys()),
|
305 |
-
interactive=True,
|
306 |
-
elem_id="filter-columns-size",
|
307 |
-
)
|
308 |
-
|
309 |
-
datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
|
310 |
-
|
311 |
-
leaderboard_table = gr.components.Dataframe(
|
312 |
-
value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
|
313 |
-
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
314 |
-
datatype=TYPES,
|
315 |
-
elem_id="leaderboard-table",
|
316 |
-
interactive=False,
|
317 |
-
visible=True,
|
318 |
-
)
|
319 |
-
|
320 |
-
# Dummy leaderboard for handling the case when the user uses backspace key
|
321 |
-
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
322 |
-
value=datasets_original_df[DATASET_COLS],
|
323 |
-
headers=DATASET_COLS,
|
324 |
-
datatype=TYPES,
|
325 |
-
visible=False,
|
326 |
-
)
|
327 |
-
|
328 |
-
|
329 |
-
search_bar.submit(
|
330 |
-
update_table,
|
331 |
-
[
|
332 |
-
hidden_leaderboard_table_for_search,
|
333 |
-
shown_columns,
|
334 |
-
search_bar,
|
335 |
-
filter_columns_type,
|
336 |
-
filter_domain_specific,
|
337 |
-
filter_columns_size
|
338 |
-
# filter_columns_architecture
|
339 |
-
],
|
340 |
-
leaderboard_table,
|
341 |
-
)
|
342 |
-
for selector in [
|
343 |
-
shown_columns,
|
344 |
-
filter_columns_type,
|
345 |
-
filter_domain_specific,
|
346 |
-
# filter_columns_architecture,
|
347 |
-
filter_columns_size,
|
348 |
-
# deleted_models_visibility,
|
349 |
-
]:
|
350 |
-
selector.change(
|
351 |
-
update_table,
|
352 |
-
[
|
353 |
-
hidden_leaderboard_table_for_search,
|
354 |
-
shown_columns,
|
355 |
-
search_bar,
|
356 |
-
filter_columns_type,
|
357 |
-
filter_domain_specific,
|
358 |
-
filter_columns_size
|
359 |
-
# filter_columns_architecture,
|
360 |
-
],
|
361 |
-
leaderboard_table,
|
362 |
-
queue=True,
|
363 |
-
)
|
364 |
|
365 |
with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
|
366 |
with gr.Row():
|
@@ -938,7 +827,119 @@ with demo:
|
|
938 |
with gr.Accordion("Question generation", open=False):
|
939 |
system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
|
940 |
with gr.Accordion("Cross Examination", open=False):
|
941 |
-
system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
942 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=5):
|
943 |
gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
|
944 |
gr.HTML(FIVE_PILLAR_DIAGRAM)
|
|
|
250 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
251 |
|
252 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
253 |
|
254 |
with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
|
255 |
with gr.Row():
|
|
|
827 |
with gr.Accordion("Question generation", open=False):
|
828 |
system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
|
829 |
with gr.Accordion("Cross Examination", open=False):
|
830 |
+
system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
|
831 |
+
with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=0):
|
832 |
+
with gr.Row():
|
833 |
+
with gr.Column():
|
834 |
+
with gr.Row():
|
835 |
+
search_bar = gr.Textbox(
|
836 |
+
placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
|
837 |
+
show_label=False,
|
838 |
+
elem_id="search-bar",
|
839 |
+
)
|
840 |
+
with gr.Row():
|
841 |
+
shown_columns = gr.CheckboxGroup(
|
842 |
+
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
|
843 |
+
value=[
|
844 |
+
c.name
|
845 |
+
for c in fields(AutoEvalColumn)
|
846 |
+
if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
|
847 |
+
],
|
848 |
+
label="Select columns to show",
|
849 |
+
elem_id="column-select",
|
850 |
+
interactive=True,
|
851 |
+
)
|
852 |
+
# with gr.Row():
|
853 |
+
# deleted_models_visibility = gr.Checkbox(
|
854 |
+
# value=False, label="Show gated/private/deleted models", interactive=True
|
855 |
+
# )
|
856 |
+
with gr.Column(min_width=320):
|
857 |
+
# with gr.Box(elem_id="box-filter"):
|
858 |
+
filter_columns_type = gr.CheckboxGroup(
|
859 |
+
label="Model Types",
|
860 |
+
choices=[t.to_str() for t in ModelType],
|
861 |
+
value=[t.to_str() for t in ModelType],
|
862 |
+
interactive=True,
|
863 |
+
elem_id="filter-columns-type",
|
864 |
+
)
|
865 |
+
# filter_columns_architecture = gr.CheckboxGroup(
|
866 |
+
# label="Architecture Types",
|
867 |
+
# choices=[i.value.name for i in ModelArch],
|
868 |
+
# value=[i.value.name for i in ModelArch],
|
869 |
+
# interactive=True,
|
870 |
+
# elem_id="filter-columns-architecture",
|
871 |
+
# )
|
872 |
+
filter_domain_specific = gr.CheckboxGroup(
|
873 |
+
label="Domain Specificity",
|
874 |
+
choices=["🏥 Clinical models", "Generic models"],
|
875 |
+
value=["🏥 Clinical models", "Generic models"],
|
876 |
+
interactive=True,
|
877 |
+
elem_id="filter-columns-type",
|
878 |
+
)
|
879 |
+
filter_columns_size = gr.CheckboxGroup(
|
880 |
+
label="Model sizes (in billions of parameters)",
|
881 |
+
choices=list(NUMERIC_INTERVALS.keys()),
|
882 |
+
value=list(NUMERIC_INTERVALS.keys()),
|
883 |
+
interactive=True,
|
884 |
+
elem_id="filter-columns-size",
|
885 |
+
)
|
886 |
+
|
887 |
+
datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
|
888 |
+
|
889 |
+
leaderboard_table = gr.components.Dataframe(
|
890 |
+
value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
|
891 |
+
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
892 |
+
datatype=TYPES,
|
893 |
+
elem_id="leaderboard-table",
|
894 |
+
interactive=False,
|
895 |
+
visible=True,
|
896 |
+
)
|
897 |
+
|
898 |
+
# Dummy leaderboard for handling the case when the user uses backspace key
|
899 |
+
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
900 |
+
value=datasets_original_df[DATASET_COLS],
|
901 |
+
headers=DATASET_COLS,
|
902 |
+
datatype=TYPES,
|
903 |
+
visible=False,
|
904 |
+
)
|
905 |
+
|
906 |
+
|
907 |
+
search_bar.submit(
|
908 |
+
update_table,
|
909 |
+
[
|
910 |
+
hidden_leaderboard_table_for_search,
|
911 |
+
shown_columns,
|
912 |
+
search_bar,
|
913 |
+
filter_columns_type,
|
914 |
+
filter_domain_specific,
|
915 |
+
filter_columns_size
|
916 |
+
# filter_columns_architecture
|
917 |
+
],
|
918 |
+
leaderboard_table,
|
919 |
+
)
|
920 |
+
for selector in [
|
921 |
+
shown_columns,
|
922 |
+
filter_columns_type,
|
923 |
+
filter_domain_specific,
|
924 |
+
# filter_columns_architecture,
|
925 |
+
filter_columns_size,
|
926 |
+
# deleted_models_visibility,
|
927 |
+
]:
|
928 |
+
selector.change(
|
929 |
+
update_table,
|
930 |
+
[
|
931 |
+
hidden_leaderboard_table_for_search,
|
932 |
+
shown_columns,
|
933 |
+
search_bar,
|
934 |
+
filter_columns_type,
|
935 |
+
filter_domain_specific,
|
936 |
+
filter_columns_size
|
937 |
+
# filter_columns_architecture,
|
938 |
+
],
|
939 |
+
leaderboard_table,
|
940 |
+
queue=True,
|
941 |
+
)
|
942 |
+
|
943 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=5):
|
944 |
gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
|
945 |
gr.HTML(FIVE_PILLAR_DIAGRAM)
|