update results
- app.py +12 -12
- src/display/about.py +24 -9
app.py
CHANGED
@@ -45,7 +45,7 @@ show_columns_overall = ['R', 'Model', 'type', 'open?','#P(B)', 'SeaExam-pub', 'S
 TYPES_overall = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number']
 
 # Load the data from the csv file
-csv_path = f'{EVAL_RESULTS_PATH}/
+csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20241122.csv'
 # csv_path = f'eval-results/SeaExam_results_20241030.csv'
 df = pd.read_csv(csv_path, skiprows=1, header=0)
 # df_m3exam, df_mmlu, df_avg = load_data(csv_path)
@@ -54,7 +54,7 @@ df_seaexam, df_seabench, df_overall = load_data(csv_path)
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
-    gr.HTML(SUB_TITLE)
+    # gr.HTML(SUB_TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
@@ -125,18 +125,18 @@ with demo:
 
         with gr.TabItem("π About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-
-
-
-
-
-
-
-
+            with gr.Row():
+                with gr.Accordion("π Citation", open=False):
+                    citation_button = gr.Textbox(
+                        value=CITATION_BUTTON_TEXT,
+                        label=CITATION_BUTTON_LABEL,
+                        lines=20,
+                        elem_id="citation-button",
+                        show_copy_button=True,
+                    )
            gr.Markdown(CONTACT_TEXT, elem_classes="markdown-text")
 
-demo.launch()
+demo.launch(share=True)
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
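Note on the unchanged scheduler lines at the end of app.py: `restart_space` is referenced but not defined anywhere in this diff. Below is a minimal sketch of how such a periodic restart is typically wired with APScheduler and huggingface_hub; the Space id and token handling are assumptions for illustration, not taken from this repository.

```python
# Sketch only: restart the Space periodically so freshly uploaded result CSVs are picked up.
# REPO_ID and the token source are assumed values, not from this repo.
import os

from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

REPO_ID = "SeaLLMs/SeaExam_leaderboard"          # hypothetical Space id
API = HfApi(token=os.environ.get("HF_TOKEN"))    # write token usually supplied via a Space secret

def restart_space():
    # HfApi.restart_space triggers a restart of the given Space
    API.restart_space(repo_id=REPO_ID)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)  # every 30 minutes
scheduler.start()
```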
src/display/about.py
CHANGED
@@ -16,10 +16,11 @@ class Tasks(Enum):
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">π SeaExam and SeaBench Leaderboard</h1>"""
+# TITLE = """<h1 align="center" id="space-title">π SeaExam and SeaBench Leaderboard</h1>"""
+TITLE = """<h1 align="left" id="space-title">π LLM Leaderboard for SEA</h1>"""
 
 # subtitle
-SUB_TITLE = """<h2 align="
+SUB_TITLE = """<h2 align="left" id="space-title">What is the best LLM for Southeast Asian Languagesβ</h1>"""
 
 # What does your leaderboard evaluate?
 # INTRODUCTION_TEXT = """
@@ -34,6 +35,14 @@ INTRODUCTION_TEXT = """
 This leaderboard evaluates Large Language Models (LLMs) on Southeast Asian (SEA) languages through two comprehensive benchmarks: SeaExam and SeaBench. SeaExam assesses world knowledge and reasoning capabilities through exam-style questions, while SeaBench evaluates instruction-following abilities and multi-turn conversational skills. For detailed methodology and results, please refer to the "π About" tab.
 """
 
+INTRODUCTION_TEXT = """
+This leaderboard evaluates Large Language Models (LLMs) on Southeast Asian (SEA) languages through two comprehensive benchmarks: SeaExam and SeaBench:
+* SeaExam assesses world knowledge and reasoning capabilities through exam-style questions [[data (public)](https://huggingface.co/datasets/SeaLLMs/SeaExam)] [[code](https://github.com/DAMO-NLP-SG/SeaExam)]
+* SeaBench evaluates instruction-following abilities and multi-turn conversational skills. [[data (public)](https://huggingface.co/datasets/SeaLLMs/SeaBench)] [[code](https://github.com/DAMO-NLP-SG/SeaBench?tab=readme-ov-file)]
+
+Note: "pub" denotes public dataset, and "prv" denotes private dataset.
+For more details, please refer to the "π About" tab.
+"""
 # For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "π About" tab.
 
 # Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
@@ -46,7 +55,7 @@ Even though large language models (LLMs) have shown impressive performance on va
 
 
 ## Datasets
-The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam) and SeaBench dataset
+The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam) and [SeaBench dataset](https://huggingface.co/datasets/SeaLLMs/SeaBench).
 - **SeaExam**: a benchmark sourced from real and official human exam questions in multiple-choice format.
 - **SeaBench**: a manually created benchmark for evaluating the model's ability to follow instructions and engage in multi-turn conversations. The questions are in open-ended format.
 
@@ -59,7 +68,7 @@ The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/
 _ **SeaBench**:
 We evaluate the responses of the models with GPT-4o-2024-08-06. Each response is scored on a scale of 1-10.
 
-##
+## Results
 How to interpret the leaderboard?
 * Each numerical value represet the accuracy (%) for SeaExam and score for SeaBench.
 * The "π Overall" shows the average results across the three langauges for SeaExam public dataset (SeaExam-pub), SeaExam private dataset (SeaExam-prv), SeaBench public dataset (SeaBench-pub), (SeaBench-prv). This leaderboard is ranked by SeaExam-prv.
@@ -69,13 +78,13 @@ How to interpret the leaderboard?
 * "open?" column indicates whether the model is open-source or proprietary.
 
 ## Reproducibility
-To reproduce our results, use the script in [
-```python
-python scripts/main.py --model $model_name_or_path
-```
-
+To reproduce our results, use the script in [SeaExam](https://github.com/DAMO-NLP-SG/SeaExam/tree/main) and [SeaBench](https://github.com/DAMO-NLP-SG/SeaBench). The script will download the model and tokenizer, and evaluate the model on the benchmark data.
 """
 
+# ```python
+# python scripts/main.py --model $model_name_or_path
+# ```
+
 # You can find the detailed numerical results in the results Hugging Face dataset: https://huggingface.co/datasets/SeaLLMs/SeaExam-results
 
 EVALUATION_QUEUE_TEXT = """
@@ -110,6 +119,12 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@article{damonlp2024sealeaderboard,
+  author = {Chaoqun Liu, Wenxuan Zhang, Jiahao Ying, Mahani Aljunied, Anh Tuan Luu, Lidong Bing},
+  title = {SeaExam and SeaBench: Benchmarking LLMs with Local Multilingual Questions in Southeast Asia},
+  year = {2024},
+  url = {},
+}
 """
 
 CONTACT_TEXT = f"""
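The public benchmark data linked in the updated INTRODUCTION_TEXT and Datasets section lives on the Hugging Face Hub. A minimal sketch of pulling it with the `datasets` library; the default configs/splits used below are assumptions, so check the dataset cards for the actual layout.

```python
# Sketch only: load the public SeaExam / SeaBench data referenced in about.py.
# Default configs and splits are assumed; inspect the dataset cards before relying on them.
from datasets import load_dataset

seaexam = load_dataset("SeaLLMs/SeaExam")    # multiple-choice exam questions
seabench = load_dataset("SeaLLMs/SeaBench")  # open-ended, multi-turn questions

print(seaexam)   # inspect available splits and columns
print(seabench)
```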
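about.py also states that SeaBench responses are scored on a 1-10 scale by GPT-4o-2024-08-06. The actual judging prompt and parsing live in the SeaBench repo, not in this diff; the sketch below only illustrates the general LLM-as-judge pattern, with a made-up prompt and a naive integer parse.

```python
# Illustrative LLM-as-judge scoring (1-10); NOT the SeaBench repo's real prompt or parser.
from openai import OpenAI

client = OpenAI()  # expects OPENAI_API_KEY in the environment

def judge_response(question: str, answer: str) -> int:
    # Hypothetical rubric: the real criteria are defined in the SeaBench repo.
    prompt = (
        "Rate the assistant's answer to the question on a 1-10 scale for correctness, "
        "helpfulness and language quality. Reply with a single integer only.\n\n"
        f"Question: {question}\n\nAnswer: {answer}"
    )
    completion = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return int(completion.choices[0].message.content.strip())
```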