import gradio as gr
import pandas as pd
import os
from huggingface_hub import snapshot_download
from apscheduler.schedulers.background import BackgroundScheduler
from src.display.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.envs import API

# Download the SeaExam evaluation results from the Hugging Face Hub
TOKEN = os.environ.get("TOKEN", None)
RESULTS_REPO = "SeaLLMs/SeaExam-results"
CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
print(EVAL_RESULTS_PATH)
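
# Minimal sketch of the assumed environment setup (values below are
# placeholders, not real credentials): TOKEN needs read access to the results
# dataset and permission to restart the Space; HF_HOME is optional and
# defaults to the current directory.
#   export TOKEN=hf_xxxxxxxx
#   export HF_HOME=/data/hf-cache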
snapshot_download(
    repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset",
    token=TOKEN,
)

def restart_space():
    # Restart the Space so it re-downloads fresh results (scheduled below)
    API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)

def load_data(data_path):
    """Load the results CSV and split it into M3Exam, MMLU, and average tables."""
    df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
    columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
    columns_sorted = ['Model', 'type', 'open?', 'shot', 'avg', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi']

    # Split into three DataFrames: M3Exam, MMLU, and the averages of both.
    # .copy() keeps each slice independent of df and avoids SettingWithCopyWarning.
    df_m3exam = df.iloc[:, :11].copy()                                     # M3Exam columns
    df_mmlu = df.iloc[:, [0, 1, 2, 3, 11, 12, 13, 14, 15, 16, 17]].copy() # MMLU columns
    df_avg = df.iloc[:, [0, 1, 2, 3, 18, 19, 20, 21, 22, 23, 24]].copy()  # average columns
    df_m3exam.columns = columns
    df_mmlu.columns = columns
    df_avg.columns = columns

    # Convert the score columns to percentages, rounded to 2 decimal places
    score_cols = ['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
    for df_tmp in [df_m3exam, df_mmlu, df_avg]:
        df_tmp[score_cols] *= 100
        df_tmp[score_cols] = df_tmp[score_cols].round(2)

    # Reorder the columns and sort each table by 'avg' in descending order
    df_m3exam = df_m3exam[columns_sorted].sort_values(by='avg', ascending=False)
    df_mmlu = df_mmlu[columns_sorted].sort_values(by='avg', ascending=False)
    df_avg = df_avg[columns_sorted].sort_values(by='avg', ascending=False)
    return df_m3exam, df_mmlu, df_avg
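
# For reference, the slicing above assumes the CSV has 25 columns laid out as
# follows (inferred from the iloc indices; the header wording in the file may
# differ):
#   cols 0-3    Model, type, open?, shot
#   cols 4-10   M3Exam:  en, zh, id, th, vi, avg, avg_sea
#   cols 11-17  MMLU:    en, zh, id, th, vi, avg, avg_sea
#   cols 18-24  average: en, zh, id, th, vi, avg, avg_sea
# plus one extra leading header row that skiprows=1 discards.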

# Path to the results CSV inside the downloaded snapshot
csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results.csv'
df_m3exam, df_mmlu, df_avg = load_data(csv_path)

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("πŸ… LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard_table = gr.components.Dataframe(
                value=df_avg,
                interactive=False,
                visible=True,
            )
        with gr.TabItem("πŸ… M3Exam", elem_id="llm-benchmark-M3Exam", id=1):
            leaderboard_table = gr.components.Dataframe(
                value=df_m3exam,
                interactive=False,
                visible=True,
            )
        with gr.TabItem("πŸ… MMLU", elem_id="llm-benchmark-MMLU", id=2):
            leaderboard_table = gr.components.Dataframe(
                value=df_mmlu,
                interactive=False,
                visible=True,
            )
        with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
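
    # CITATION_BUTTON_LABEL and CITATION_BUTTON_TEXT are imported above but not
    # yet used; a minimal sketch of the citation box they suggest, left
    # commented out so it does not change the current UI:
    # with gr.Accordion("πŸ“™ Citation", open=False):
    #     gr.Textbox(
    #         value=CITATION_BUTTON_TEXT,
    #         label=CITATION_BUTTON_LABEL,
    #         lines=10,
    #         show_copy_button=True,
    #     )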

# Restart the Space every 30 minutes so the leaderboard picks up new results
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch(share=True)