import gradio as gr
import pandas as pd
import os
from huggingface_hub import snapshot_download
from apscheduler.schedulers.background import BackgroundScheduler

from src.display.about import (
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.envs import API

# Clone / pull the eval results dataset from the Hub.
TOKEN = os.environ.get("TOKEN", None)
RESULTS_REPO = "lukecq/SeaExam-results"
CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
print(EVAL_RESULTS_PATH)
snapshot_download(
    repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=TOKEN
)


def restart_space():
    API.restart_space(repo_id="lukecq/SeaExam_leaderboard", token=TOKEN)


def load_data(data_path):
    # Skip the extra top row so the second row is used as the header.
    df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
    columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
    columns_sorted = ['Model', 'type', 'open?', 'shot', 'avg', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi']

    # Split into three DataFrames based on the column groups: M3Exam, MMLU, and average.
    # .copy() avoids SettingWithCopyWarning when the scores are scaled below.
    df_m3exam = df.iloc[:, :11].copy()                                      # M3Exam columns
    df_mmlu = df.iloc[:, [0, 1, 2, 3, 11, 12, 13, 14, 15, 16, 17]].copy()   # MMLU columns
    df_avg = df.iloc[:, [0, 1, 2, 3, 18, 19, 20, 21, 22, 23, 24]].copy()    # Average columns
    df_m3exam.columns = columns
    df_mmlu.columns = columns
    df_avg.columns = columns

    # Convert the scores to percentages and round to 2 decimal places.
    score_cols = ['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
    for df_tmp in [df_m3exam, df_mmlu, df_avg]:
        df_tmp[score_cols] = (df_tmp[score_cols] * 100).round(2)

    # Reorder the columns as in columns_sorted and sort by 'avg' in descending order.
    df_m3exam = df_m3exam[columns_sorted].sort_values(by='avg', ascending=False)
    df_mmlu = df_mmlu[columns_sorted].sort_values(by='avg', ascending=False)
    df_avg = df_avg[columns_sorted].sort_values(by='avg', ascending=False)

    return df_m3exam, df_mmlu, df_avg


# Path to the results CSV file
csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results.csv'
df_m3exam, df_mmlu, df_avg = load_data(csv_path)

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard_table = gr.components.Dataframe(
                value=df_avg,
                interactive=False,
                visible=True,
            )
        with gr.TabItem("🏅 M3Exam", elem_id="llm-benchmark-M3Exam", id=1):
            leaderboard_table = gr.components.Dataframe(
                value=df_m3exam,
                interactive=False,
                visible=True,
            )
        with gr.TabItem("🏅 MMLU", elem_id="llm-benchmark-MMLU", id=2):
            leaderboard_table = gr.components.Dataframe(
                value=df_mmlu,
                interactive=False,
                visible=True,
            )
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

# Restart the Space every 30 minutes so fresh results are pulled from the Hub.
# Note: launch only once, after the scheduler starts; the earlier extra
# demo.launch() call would block and keep the scheduler from ever running.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch(share=True)