File size: 5,640 Bytes
65c6479
441cdc8
 
0e06db3
97d7225
21f1468
65c6479
9c1b957
 
 
35957e0
9c1b957
 
 
 
60867e4
9c1b957
 
97d7225
541cf85
9c1b957
e608ddc
 
0e06db3
85b9042
e608ddc
 
 
 
 
 
 
65c6479
97d7225
85b9042
97d7225
4ecf403
 
 
 
 
 
 
 
 
 
541cf85
9e094e7
4ecf403
 
 
a55b227
9c1b957
 
 
2678c49
a55b227
60867e4
9c1b957
4ecf403
21f1468
 
4ecf403
21f1468
4ecf403
21f1468
 
 
 
 
 
 
 
 
 
4ecf403
7bd7d77
4ecf403
 
cfb8d80
21f1468
4ecf403
21f1468
4ecf403
21f1468
 
 
 
 
 
 
 
 
 
 
 
4ecf403
7157d11
c69a5b0
21f1468
cfb8d80
4ecf403
6c84d42
4ecf403
21f1468
4ecf403
21f1468
 
 
 
 
 
 
 
 
 
 
 
4ecf403
6c84d42
21f1468
 
6c84d42
040103b
cfb8d80
4d3390f
c71e3de
 
 
 
 
 
 
 
 
35957e0
441cdc8
2678c49
97d7225
 
cfb8d80
97d7225
653c0f4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import gradio as gr
import pandas as pd
import os
from huggingface_hub import snapshot_download, login
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter

from src.display.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    CONTACT_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
    SUB_TITLE,
)
from src.display.css_html_js import custom_css
from src.envs import API
from src.leaderboard.load_results import load_data

# Authenticate with the Hugging Face Hub and sync the evaluation-results
# dataset into a local cache directory.
TOKEN = os.environ.get("TOKEN", None)
login(token=TOKEN)
RESULTS_REPO = "SeaLLMs/SeaExam-results"  # dataset repo holding the eval results
# Reuse HF_HOME as the cache root when set; fall back to the working directory.
CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
print(EVAL_RESULTS_PATH)
# Download (or refresh) the results dataset into EVAL_RESULTS_PATH.
snapshot_download(
    repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset",
    token=TOKEN,
)

def restart_space():
    """Restart the leaderboard Space so freshly synced results are reloaded."""
    API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)

# Column layouts and per-column datatypes for the benchmark tabs.
# "-pub" / "-prv" suffixes distinguish public vs. private test-set scores;
# "⬇️" marks the default sort column. '#P(B)' is the parameter count in billions.
all_columns = ['R', 'Model', 'type', 'open?', 'avg-pub', 'avg-prv ⬇️', 'id-pub',
       'th-pub', 'vi-pub', 'id-prv', 'th-prv', 'vi-prv', '#P(B)']
show_columns = ['R', 'Model','type','open?','#P(B)', 'avg-pub', 'avg-prv ⬇️', 
        'id-pub', 'th-pub', 'vi-pub', 'id-prv', 'th-prv', 'vi-prv']
TYPES = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']

# Columns/datatypes for the aggregated "Overall" tab.
show_columns_overall = ['R', 'Model', 'type', 'open?','#P(B)', 'SeaExam-pub', 'SeaExam-prv ⬇️',
                'SeaBench-pub', 'SeaBench-prv']
TYPES_overall = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number']

# Load the results CSV and split it into one dataframe per tab.
# NOTE(review): the previous direct `pd.read_csv(csv_path, ...)` here was dead
# code — its result was never used and load_data parses the file itself.
csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20250318.csv'
df_seaexam, df_seabench, df_overall = load_data(csv_path)

# Build the Gradio UI: one leaderboard per tab (Overall / SeaExam / SeaBench)
# plus an About tab and a contact footer.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    # gr.HTML(SUB_TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    
    with gr.Tabs(elem_classes="tab-buttons") as tabs:

        # Aggregated scores across both benchmarks.
        with gr.Tab("🏅 Overall"):
            Leaderboard(
                value=df_overall[show_columns_overall],
                select_columns=SelectColumns(
                    default_selection=show_columns_overall,
                    cant_deselect=["R", "Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=["Model"],
                # hide_columns=["model_name_for_query", "Model Size"],
                filter_columns=[
                    "type",
                    "open?",
                    # ColumnFilter("MOE", type="boolean", default=False, label="MoE"),
                    # ColumnFilter("Flagged", type="boolean", default=False),
                    # Fixed label typo: "Paramers(B)" -> "Parameters(B)".
                    ColumnFilter("#P(B)", default=[7, 9], label="Parameters(B)"),
                ],
                datatype=TYPES_overall,
                # column_widths=["3%", "20%", "6%", "4%"]
            )
        
        # Per-language SeaExam (exam-style QA) scores.
        with gr.Tab("SeaExam"):
            Leaderboard(
                value=df_seaexam[show_columns],
                select_columns=SelectColumns(
                    default_selection=show_columns,
                    cant_deselect=["R", "Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=["Model"],
                # hide_columns=["model_name_for_query", "Model Size"],
                filter_columns=[
                    "type",
                    "open?",
                    # ColumnFilter("MOE", type="boolean", default=False, label="MoE"),
                    # ColumnFilter("Flagged", type="boolean", default=False),
                    ColumnFilter("#P(B)", default=[7, 9]),
                ],
                datatype=TYPES,
                # column_widths=["2%", "33%"],
            )
        

        # Per-language SeaBench scores.
        with gr.Tab("SeaBench"):
            Leaderboard(
                value=df_seabench[show_columns],
                select_columns=SelectColumns(
                    default_selection=show_columns,
                    cant_deselect=["R", "Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=["Model"],
                # hide_columns=["model_name_for_query", "Model Size"],
                filter_columns=[
                    "type",
                    "open?",
                    # ColumnFilter("MOE", type="boolean", default=False, label="MoE"),
                    # ColumnFilter("Flagged", type="boolean", default=False),
                    ColumnFilter("#P(B)", default=[7, 9]),
                ],
                datatype=TYPES,
                # column_widths=["2%", "33%"],
            )

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
    # with gr.Row():
    #     with gr.Accordion("📙 Citation", open=False):
    #         citation_button = gr.Textbox(
    #             value=CITATION_BUTTON_TEXT,
    #             label=CITATION_BUTTON_LABEL,
    #             lines=20,
    #             elem_id="citation-button",
    #             show_copy_button=True,
    #         )
    gr.Markdown(CONTACT_TEXT, elem_classes="markdown-text")

# Restart the Space every 30 minutes so newly uploaded results are picked up.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
# Launch exactly once, with the request queue configured first.
# NOTE(review): the original called demo.launch(share=True) a second time
# before this line; that first launch blocks the script (scheduler never
# started) and the app would run without the queue settings.
demo.queue(default_concurrency_limit=40).launch(share=True)