File size: 5,640 Bytes
65c6479
441cdc8
 
0e06db3
97d7225
21f1468
65c6479
9c1b957
 
 
35957e0
9c1b957
 
 
 
60867e4
9c1b957
 
97d7225
541cf85
9c1b957
e608ddc
 
0e06db3
85b9042
e608ddc
 
 
 
 
 
 
65c6479
97d7225
85b9042
97d7225
4ecf403
 
 
 
 
 
 
 
 
 
541cf85
9e094e7
4ecf403
 
 
a55b227
9c1b957
 
 
2678c49
a55b227
60867e4
9c1b957
4ecf403
21f1468
 
4ecf403
21f1468
4ecf403
21f1468
 
 
 
 
 
 
 
 
 
4ecf403
7bd7d77
4ecf403
 
cfb8d80
21f1468
4ecf403
21f1468
4ecf403
21f1468
 
 
 
 
 
 
 
 
 
 
 
4ecf403
7157d11
c69a5b0
21f1468
cfb8d80
4ecf403
6c84d42
4ecf403
21f1468
4ecf403
21f1468
 
 
 
 
 
 
 
 
 
 
 
4ecf403
6c84d42
21f1468
 
6c84d42
040103b
cfb8d80
4d3390f
c71e3de
 
 
 
 
 
 
 
 
35957e0
441cdc8
2678c49
97d7225
 
cfb8d80
97d7225
653c0f4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import gradio as gr
import pandas as pd
import os
from huggingface_hub import snapshot_download, login
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter

from src.display.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    CONTACT_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
    SUB_TITLE,
)
from src.display.css_html_js import custom_css
from src.envs import API
from src.leaderboard.load_results import load_data

# Authenticate with the Hugging Face Hub and sync the evaluation-results
# dataset into a local cache directory.
TOKEN = os.environ.get("TOKEN", None)
login(token=TOKEN)
RESULTS_REPO = "SeaLLMs/SeaExam-results"  # dataset repo holding the eval results
# Reuse HF_HOME as the cache root when set; fall back to the working directory.
CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
print(EVAL_RESULTS_PATH)
# Download (or refresh) the results dataset into EVAL_RESULTS_PATH.
snapshot_download(
    repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset",
    token=TOKEN,
)

def restart_space():
    """Restart the leaderboard Space so freshly synced results are reloaded."""
    API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)

# Column layouts and per-column datatypes for the benchmark tabs.
# "-pub" / "-prv" suffixes distinguish public vs. private test-set scores;
# "⬇️" marks the default sort column. '#P(B)' is the parameter count in billions.
all_columns = ['R', 'Model', 'type', 'open?', 'avg-pub', 'avg-prv ⬇️', 'id-pub',
       'th-pub', 'vi-pub', 'id-prv', 'th-prv', 'vi-prv', '#P(B)']
show_columns = ['R', 'Model','type','open?','#P(B)', 'avg-pub', 'avg-prv ⬇️', 
        'id-pub', 'th-pub', 'vi-pub', 'id-prv', 'th-prv', 'vi-prv']
TYPES = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']

# Columns/datatypes for the aggregated "Overall" tab.
show_columns_overall = ['R', 'Model', 'type', 'open?','#P(B)', 'SeaExam-pub', 'SeaExam-prv ⬇️',
                'SeaBench-pub', 'SeaBench-prv']
TYPES_overall = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number']

# Load the results CSV and split it into one dataframe per tab.
# NOTE(review): the previous direct `pd.read_csv(csv_path, ...)` here was dead
# code — its result was never used and load_data parses the file itself.
csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20250318.csv'
df_seaexam, df_seabench, df_overall = load_data(csv_path)

# Build the Gradio UI: one leaderboard per tab (Overall / SeaExam / SeaBench)
# plus an About tab and a contact footer.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    # gr.HTML(SUB_TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    
    with gr.Tabs(elem_classes="tab-buttons") as tabs:

        # Aggregated scores across both benchmarks.
        with gr.Tab("🏅 Overall"):
            Leaderboard(
                value=df_overall[show_columns_overall],
                select_columns=SelectColumns(
                    default_selection=show_columns_overall,
                    cant_deselect=["R", "Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=["Model"],
                # hide_columns=["model_name_for_query", "Model Size"],
                filter_columns=[
                    "type",
                    "open?",
                    # ColumnFilter("MOE", type="boolean", default=False, label="MoE"),
                    # ColumnFilter("Flagged", type="boolean", default=False),
                    # Fixed label typo: "Paramers(B)" -> "Parameters(B)".
                    ColumnFilter("#P(B)", default=[7, 9], label="Parameters(B)"),
                ],
                datatype=TYPES_overall,
                # column_widths=["3%", "20%", "6%", "4%"]
            )
        
        # Per-language SeaExam (exam-style QA) scores.
        with gr.Tab("SeaExam"):
            Leaderboard(
                value=df_seaexam[show_columns],
                select_columns=SelectColumns(
                    default_selection=show_columns,
                    cant_deselect=["R", "Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=["Model"],
                # hide_columns=["model_name_for_query", "Model Size"],
                filter_columns=[
                    "type",
                    "open?",
                    # ColumnFilter("MOE", type="boolean", default=False, label="MoE"),
                    # ColumnFilter("Flagged", type="boolean", default=False),
                    ColumnFilter("#P(B)", default=[7, 9]),
                ],
                datatype=TYPES,
                # column_widths=["2%", "33%"],
            )
        

        # Per-language SeaBench scores.
        with gr.Tab("SeaBench"):
            Leaderboard(
                value=df_seabench[show_columns],
                select_columns=SelectColumns(
                    default_selection=show_columns,
                    cant_deselect=["R", "Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=["Model"],
                # hide_columns=["model_name_for_query", "Model Size"],
                filter_columns=[
                    "type",
                    "open?",
                    # ColumnFilter("MOE", type="boolean", default=False, label="MoE"),
                    # ColumnFilter("Flagged", type="boolean", default=False),
                    ColumnFilter("#P(B)", default=[7, 9]),
                ],
                datatype=TYPES,
                # column_widths=["2%", "33%"],
            )

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
    # with gr.Row():
    #     with gr.Accordion("📙 Citation", open=False):
    #         citation_button = gr.Textbox(
    #             value=CITATION_BUTTON_TEXT,
    #             label=CITATION_BUTTON_LABEL,
    #             lines=20,
    #             elem_id="citation-button",
    #             show_copy_button=True,
    #         )
    gr.Markdown(CONTACT_TEXT, elem_classes="markdown-text")

# Restart the Space every 30 minutes so newly uploaded results are picked up.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
# Launch exactly once, with the request queue configured first.
# NOTE(review): the original called demo.launch(share=True) a second time
# before this line; that first launch blocks the script (scheduler never
# started) and the app would run without the queue settings.
demo.queue(default_concurrency_limit=40).launch(share=True)