lukecq committed
Commit 2678c49 · 1 Parent(s): 4ecf403

update results

Files changed (2):
  1. app.py +12 -12
  2. src/display/about.py +24 -9
app.py CHANGED
@@ -45,7 +45,7 @@ show_columns_overall = ['R', 'Model', 'type', 'open?','#P(B)', 'SeaExam-pub', 'S
 TYPES_overall = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number']
 
 # Load the data from the csv file
-csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20241030.csv'
+csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20241122.csv'
 # csv_path = f'eval-results/SeaExam_results_20241030.csv'
 df = pd.read_csv(csv_path, skiprows=1, header=0)
 # df_m3exam, df_mmlu, df_avg = load_data(csv_path)
@@ -54,7 +54,7 @@ df_seaexam, df_seabench, df_overall = load_data(csv_path)
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
-    gr.HTML(SUB_TITLE)
+    # gr.HTML(SUB_TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
@@ -125,18 +125,18 @@ with demo:
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-            # with gr.Row():
-            #     with gr.Accordion("📙 Citation", open=False):
-            #         citation_button = gr.Textbox(
-            #             value=CITATION_BUTTON_TEXT,
-            #             label=CITATION_BUTTON_LABEL,
-            #             lines=20,
-            #             elem_id="citation-button",
-            #             show_copy_button=True,
-            #         )
+            with gr.Row():
+                with gr.Accordion("📙 Citation", open=False):
+                    citation_button = gr.Textbox(
+                        value=CITATION_BUTTON_TEXT,
+                        label=CITATION_BUTTON_LABEL,
+                        lines=20,
+                        elem_id="citation-button",
+                        show_copy_button=True,
+                    )
             gr.Markdown(CONTACT_TEXT, elem_classes="markdown-text")
 
-demo.launch()
+demo.launch(share=True)
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
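For context, the scheduler at the end of app.py restarts the Space every 1800 seconds so freshly pushed results are picked up. `restart_space` is defined outside the lines shown here; below is a minimal sketch of how the standard Hugging Face leaderboard template implements it (the Space id and token variable are assumptions, not taken from this repo):

```python
# Minimal sketch of the restart_space used by the scheduler above. It is
# defined outside this diff; the Space id and token env var are assumptions
# borrowed from the standard Hugging Face leaderboard template.
import os
from huggingface_hub import HfApi

REPO_ID = "SeaLLMs/SeaExam_leaderboard"  # hypothetical Space id
API = HfApi(token=os.environ.get("HF_TOKEN"))

def restart_space():
    # Rebooting the Space re-runs app.py, which re-reads the latest results CSV.
    API.restart_space(repo_id=REPO_ID)
```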
src/display/about.py CHANGED
@@ -16,10 +16,11 @@ class Tasks(Enum):
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">📃 SeaExam and SeaBench Leaderboard</h1>"""
+# TITLE = """<h1 align="center" id="space-title">📃 SeaExam and SeaBench Leaderboard</h1>"""
+TITLE = """<h1 align="left" id="space-title">🏅 LLM Leaderboard for SEA</h1>"""
 
 # subtitle
-SUB_TITLE = """<h2 align="center" id="space-title">What is the best LLM for Southeast Asian Languages❓</h2>"""
+SUB_TITLE = """<h2 align="left" id="space-title">What is the best LLM for Southeast Asian Languages❓</h2>"""
 
 # What does your leaderboard evaluate?
 # INTRODUCTION_TEXT = """
@@ -34,6 +35,14 @@ INTRODUCTION_TEXT = """
 This leaderboard evaluates Large Language Models (LLMs) on Southeast Asian (SEA) languages through two comprehensive benchmarks: SeaExam and SeaBench. SeaExam assesses world knowledge and reasoning capabilities through exam-style questions, while SeaBench evaluates instruction-following abilities and multi-turn conversational skills. For detailed methodology and results, please refer to the "📝 About" tab.
 """
 
+INTRODUCTION_TEXT = """
+This leaderboard evaluates Large Language Models (LLMs) on Southeast Asian (SEA) languages through two comprehensive benchmarks, SeaExam and SeaBench:
+* SeaExam assesses world knowledge and reasoning capabilities through exam-style questions [[data (public)](https://huggingface.co/datasets/SeaLLMs/SeaExam)] [[code](https://github.com/DAMO-NLP-SG/SeaExam)]
+* SeaBench evaluates instruction-following abilities and multi-turn conversational skills [[data (public)](https://huggingface.co/datasets/SeaLLMs/SeaBench)] [[code](https://github.com/DAMO-NLP-SG/SeaBench?tab=readme-ov-file)]
+
+Note: "pub" denotes public dataset, and "prv" denotes private dataset.
+For more details, please refer to the "📝 About" tab.
+"""
 # For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
 
 # Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
@@ -46,7 +55,7 @@ Even though large language models (LLMs) have shown impressive performance on va
 
 
 ## Datasets
-The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam) and SeaBench dataset (will be publicly available soon).
+The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam) and [SeaBench dataset](https://huggingface.co/datasets/SeaLLMs/SeaBench).
 - **SeaExam**: a benchmark sourced from real and official human exam questions in multiple-choice format.
 - **SeaBench**: a manually created benchmark for evaluating the model's ability to follow instructions and engage in multi-turn conversations. The questions are in open-ended format.
 
@@ -59,7 +68,7 @@ The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/
 - **SeaBench**:
 We evaluate the responses of the models with GPT-4o-2024-08-06. Each response is scored on a scale of 1-10.
 
-## Reults
+## Results
 How to interpret the leaderboard?
 * Each numerical value represents the accuracy (%) for SeaExam and the score for SeaBench.
 * The "🏅 Overall" tab shows the average results across the three languages for the SeaExam public dataset (SeaExam-pub), SeaExam private dataset (SeaExam-prv), SeaBench public dataset (SeaBench-pub), and SeaBench private dataset (SeaBench-prv). The leaderboard is ranked by SeaExam-prv.
@@ -69,13 +78,13 @@ How to interpret the leaderboard?
 * "open?" column indicates whether the model is open-source or proprietary.
 
 ## Reproducibility
-To reproduce our results, use the script in [this repo](https://github.com/DAMO-NLP-SG/SeaExam/tree/main). The script will download the model and tokenizer, and evaluate the model on the benchmark data.
-```python
-python scripts/main.py --model $model_name_or_path
-```
-
+To reproduce our results, use the scripts in [SeaExam](https://github.com/DAMO-NLP-SG/SeaExam/tree/main) and [SeaBench](https://github.com/DAMO-NLP-SG/SeaBench). The script will download the model and tokenizer, and evaluate the model on the benchmark data.
 """
 
+# ```python
+# python scripts/main.py --model $model_name_or_path
+# ```
+
 # You can find the detailed numerical results in the results Hugging Face dataset: https://huggingface.co/datasets/SeaLLMs/SeaExam-results
 
 EVALUATION_QUEUE_TEXT = """
@@ -110,6 +119,12 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@article{damonlp2024sealeaderboard,
+  author = {Chaoqun Liu and Wenxuan Zhang and Jiahao Ying and Mahani Aljunied and Anh Tuan Luu and Lidong Bing},
+  title = {SeaExam and SeaBench: Benchmarking LLMs with Local Multilingual Questions in Southeast Asia},
+  year = {2024},
+  url = {},
+}
 """
 
 CONTACT_TEXT = f"""
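The Evaluation Criteria hunk above notes that SeaBench responses are judged by GPT-4o-2024-08-06 on a 1-10 scale. Below is a minimal sketch of such an LLM-as-judge call with the OpenAI Python client; the prompt text and `judge_response` helper are hypothetical, not the project's actual grading code (see the SeaBench repo for that):

```python
# Sketch of the LLM-as-judge scoring described above: GPT-4o-2024-08-06 rates
# each SeaBench response on a 1-10 scale. Prompt wording and function name are
# hypothetical; the real grading pipeline lives in the SeaBench repo.
from openai import OpenAI

client = OpenAI()  # expects OPENAI_API_KEY in the environment

def judge_response(question: str, answer: str) -> str:
    completion = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system",
             "content": "Rate the assistant's answer on a 1-10 scale. Reply with the score only."},
            {"role": "user", "content": f"Question: {question}\nAnswer: {answer}"},
        ],
    )
    return completion.choices[0].message.content
```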