kevinpro committed · Commit 556657e
Parent(s): 8cc8064
commit message

Files changed:
- README.md +0 -12
- __pycache__/content.cpython-38.pyc +0 -0
- __pycache__/css.cpython-38.pyc +0 -0
- app.py +183 -0
- content.py +52 -0
- css.py +15 -0
README.md
CHANGED
@@ -1,12 +0,0 @@
- ---
- title: Open Multilingual Reasoning Leaderboard
- emoji: 💻
- colorFrom: pink
- colorTo: green
- sdk: gradio
- sdk_version: 4.21.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/content.cpython-38.pyc
ADDED
Binary file (1.34 kB)

__pycache__/css.cpython-38.pyc
ADDED
Binary file (416 Bytes)
app.py
ADDED
@@ -0,0 +1,183 @@
import os
import json
import glob
from collections import defaultdict
import pandas as pd
import gradio as gr
from content import *
from css import *
import glob


ARC = "arc"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa"
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]

METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]

LANGS = 'ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh'.split(',')

LANG_NAME = {
    'ar': 'Arabic',
    'bn': 'Bengali',
    'ca': 'Catalan',
    'da': 'Danish',
    'de': 'German',
    'es': 'Spanish',
    'eu': 'Basque',
    'fr': 'French',
    'gu': 'Gujarati',
    'hi': 'Hindi',
    'hr': 'Croatian',
    'hu': 'Hungarian',
    'hy': 'Armenian',
    'id': 'Indonesian',
    'it': 'Italian',
    'kn': 'Kannada',
    'ml': 'Malayalam',
    'mr': 'Marathi',
    'ne': 'Nepali',
    'nl': 'Dutch',
    'pt': 'Portuguese',
    'ro': 'Romanian',
    'ru': 'Russian',
    'sk': 'Slovak',
    'sr': 'Serbian',
    'sv': 'Swedish',
    'ta': 'Tamil',
    'te': 'Telugu',
    'uk': 'Ukrainian',
    'vi': 'Vietnamese',
    'zh': 'Chinese'
}

MODEL_COL = "Model"
LANG_COL = "Language"
CODE_COL = "Code"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"

MGSM_COL = "MGSM"
MSVAMP_COL = "MSVAMP"
MNUM_COL = "MNumGLUESub"
HELLASWAG_COL = "HellaSwag (0-shot)"
MMLU_COL = "MMLU (25-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
NOTES_COL = "Notes"  # For search only

COLS = [MODEL_COL, LANG_COL, CODE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
TYPES = ["str", "str", "str", "number", "number", "number", "number", "number", "str"]

COLS = [MODEL_COL, MSVAMP_COL, MGSM_COL, MNUM_COL, NOTES_COL]
TYPES = ["str", "number", "number", "number", "str"]


def get_leaderboard_df():
    df = list()
    results = [
        ["GPT-3.5-Turbo", 46.6, 42.2, 49.4],
        ["MAmmoTH", 26.3, 21.3, 24.2],
        ["WizardMath", 32.5, 23.0, 28.7],
        ["MetaMath", 46.2, 37.0, 43.2],
        ["QAlign", 57.2, 49.6, 0],
        ["MathOctopus", 41.2, 39.5, 37.1],
        ["MathOctopus-MAPO-DPO(ours)🔥", 57.4, 41.6, 50.4],
        ["MetaMathOctopus", 53.0, 45.5, 39.2],
        ["MetaMathOctopus-MAPO-DPO(ours) 👑", 64.7, 51.6, 52.9],
        ["MistralMathOctopus", 59.0, 58.0, 56.8],
        ["MistralMathOctopus-MAPO-DPO(ours) 👑", 74.6, 67.3, 70.0]
    ]
    # for (pretrained, lang), perfs in performance_dict.items():
    #     lang_name = LANG_NAME[lang]
    #     arc_perf = perfs.get(ARC, 0.0)
    #     hellaswag_perf = perfs.get(HELLASWAG, 0.0)
    #     mmlu_perf = perfs.get(MMLU, 0.0)
    #     truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)

    #     if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
    #         continue
    #     avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
    #     notes = ' '.join([pretrained, lang_name])
    #     row = [pretrained, lang_name, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
    #     df.append(row)
    for i in results:
        i.append("NOTE")
    df = pd.DataFrame.from_records(results, columns=COLS)
    df = df.sort_values(by=[MSVAMP_COL], ascending=False)
    df = df[COLS]

    return df


def search_table(df, query):
    filtered_df = df[df[NOTES_COL].str.contains(query, case=False)]
    return filtered_df


original_df = get_leaderboard_df()

demo = gr.Blocks(css=CUSTOM_CSS)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    # gr.Markdown(HOW_TO, elem_classes="markdown-text")

    with gr.Box():
        search_bar = gr.Textbox(
            placeholder="Search models and languages...", show_label=False, elem_id="search-bar"
        )

        leaderboard_table = gr.components.Dataframe(
            value=original_df,
            headers=COLS,
            datatype=TYPES,
            max_rows=5,
            elem_id="leaderboard-table",
        )

        # Dummy leaderboard for handling the case when the user uses backspace key
        hidden_leaderboard_table_for_search = gr.components.Dataframe(
            value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
        )

        search_bar.change(
            search_table,
            [hidden_leaderboard_table_for_search, search_bar],
            leaderboard_table,
        )

    with gr.Box():
        search_bar = gr.Textbox(
            placeholder="Search models and languages...", show_label=False, elem_id="search-bar"
        )

        leaderboard_table = gr.components.Dataframe(
            value=original_df,
            headers=COLS,
            datatype=TYPES,
            max_rows=5,
            elem_id="leaderboard-table",
        )

        # Dummy leaderboard for handling the case when the user uses backspace key
        hidden_leaderboard_table_for_search = gr.components.Dataframe(
            value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
        )

        search_bar.change(
            search_table,
            [hidden_leaderboard_table_for_search, search_bar],
            leaderboard_table,
        )

    # gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

demo.launch()
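For context, here is a minimal, self-contained sketch (not part of the committed files) of what the search wiring above does: `search_table` runs a case-insensitive substring match against the hidden Notes column, and because app.py appends the literal string "NOTE" to every row, any query not contained in "NOTE" yields an empty table. The column names below mirror the constants defined in app.py; pandas is assumed to be installed.

# Sketch of the search behaviour in app.py above; not part of the commit.
import pandas as pd

COLS = ["Model", "MSVAMP", "MGSM", "MNumGLUESub", "Notes"]
rows = [
    ["MetaMath", 46.2, 37.0, 43.2, "NOTE"],
    ["MistralMathOctopus-MAPO-DPO(ours) 👑", 74.6, 67.3, 70.0, "NOTE"],
]
df = pd.DataFrame.from_records(rows, columns=COLS)

def search_table(df, query):
    # Same logic as app.py: keep rows whose Notes cell contains the query, ignoring case.
    return df[df["Notes"].str.contains(query, case=False)]

print(search_table(df, "note"))     # matches both rows, since every Notes cell is "NOTE"
print(search_table(df, "Octopus"))  # empty result: model names are not copied into Notes in this commit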
content.py
ADDED
@@ -0,0 +1,52 @@
TITLE = '<h1 align="center" id="space-title">Open Multilingual Reasoning Leaderboard</h1>'

INTRO_TEXT = f"""
## About

This leaderboard tracks progress and ranks the reasoning performance of large language models (LLMs) developed for different languages,
emphasizing non-English languages to democratize the benefits of LLMs for broader society.
Our current leaderboard provides evaluation data for 10 languages.
Both multilingual and language-specific LLMs are welcome in this leaderboard.
We currently evaluate models over three benchmarks:

- <a href="https://huggingface.co/datasets/Mathoctopus/MSVAMP" target="_blank"> MSVAMP </a>
- <a href="https://huggingface.co/datasets/juletxara/mgsm" target="_blank"> MGSM </a>
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MNumGLUESub </a>

# """

# HOW_TO = f"""
# ## How to list your model performance on this leaderboard:

# Run the evaluation of your model using this repo: <a href="https://github.com/nlp-uoregon/mlmm-evaluation" target="_blank">https://github.com/nlp-uoregon/mlmm-evaluation</a>.

# And then, push the evaluation log and make a pull request.
# """

# CREDIT = f"""
# ## Credit

# To make this website, we use the following resources:

# - Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
# - Funding and GPU access (Adobe Research)
# - Evaluation code (EleutherAI's lm_evaluation_harness repo)
# - Leaderboard code (Huggingface4's open_llm_leaderboard repo)

# """


CITATION = f"""
## Citation

```
@misc{{she2024mapo,
      title={{MAPO: Advancing Multilingual Reasoning through Multilingual Alignment-as-Preference Optimization}},
      author={{Shuaijie She and Wei Zou and Shujian Huang and Wenhao Zhu and Xiang Liu and Xiang Geng and Jiajun Chen}},
      year={{2024}},
      eprint={{2401.06838}},
      archivePrefix={{arXiv}},
      primaryClass={{cs.CL}}
}}
```
"""
css.py
ADDED
@@ -0,0 +1,15 @@
CUSTOM_CSS = """
/* Hides the final column */
table td:last-child,
table th:last-child {
    display: none;
}

/* Controls the width of the first column */
table td:first-child,
table th:first-child {
    max-width: 200px;
    overflow: auto;
    white-space: nowrap;
}
"""
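The `last-child` rule above is what keeps the search-only Notes column (the final column appended in app.py) out of the rendered tables while `search_table` can still match against it. Below is a minimal sketch, assuming gradio is installed, of how the stylesheet is attached, mirroring the pattern used in app.py.

# Sketch only; mirrors how app.py passes the stylesheet to gr.Blocks at construction time.
import gradio as gr
from css import CUSTOM_CSS  # the module added in this commit

with gr.Blocks(css=CUSTOM_CSS) as demo:
    gr.Markdown("Leaderboard tables go here; the CSS hides each table's last column.")

# demo.launch()  # uncomment to serve the page locally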