wip
- app.py +17 -37
- community_results/.gitkeep +0 -0
- data/eval_board.csv +8 -0
- src/about.py +30 -0
app.py
CHANGED
@@ -15,46 +15,19 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
-    ABOUT_TEXT
+    ABOUT_TEXT,
+    SUBMISSION_TEXT
 )
 from src.display.css_html_js import custom_css
 from src.display.formatting import has_no_nan_values, make_clickable_model, model_hyperlink
 
-# Define model performance data and links
-model_links = {
-    "LLaVA-v1.5-7B†": "https://huggingface.co/liuhaotian/llava-v1.5-7b",
-    "Qwen2-VL-7B-Instruct†": "https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct",
-    "Qwen2-Audio-7B-Instruct†": "https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct",
-    "Chameleon-7B†": "https://huggingface.co/facebook/chameleon-7b",
-    "Llama3.1-8B-Instruct†": "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct",
-    "Gemini-1.5-Pro†": "https://deepmind.google/technologies/gemini/pro/",
-    "GPT-4o†": "https://openai.com/index/hello-gpt-4o/"
-}
-
-data = {
-    "Model": list(model_links.keys()),
-    "Perception": [2.66, 2.76, 3.58, 1.44, 1.05, 5.36, 2.66],
-    "Reasoning": [2.67, 3.07, 4.53, 2.97, 1.20, 5.67, 3.48],
-    "IF": [2.50, 2.40, 3.40, 2.80, 1.20, 6.70, 4.20],
-    "Safety": [2.90, 4.05, 2.65, 2.45, 1.35, 6.70, 5.15],
-    "AMU Score": [2.68, 3.07, 3.54, 2.41, 1.20, 6.11, 3.87],
-    "Modality Selection": [0.182, 0.177, 0.190, 0.156, 0.231, 0.227, 0.266],
-    "Instruction Following": [6.61, 7.01, 6.69, 6.09, 7.47, 8.62, 8.62],
-    "Modality Synergy": [0.43, 0.58, 0.51, 0.54, 0.60, 0.52, 0.58],
-    "AMG Score": [1.56, 2.16, 1.97, 1.57, 3.08, 3.05, 3.96],
-    "Overall": [2.12, 2.62, 2.73, 1.99, 2.14, 4.58, 3.92]
-}
-
-df = pd.DataFrame(data).sort_values(by='Overall', ascending=False)
-total_models = len(df)
-
 # Define column groups
 COLUMN_GROUPS = {
     "ALL": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score",
             "Modality Selection", "Instruction Following", "Modality Synergy",
-            "AMG Score", "Overall"],
-    "AMU": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score"],
-    "AMG": ["Model", "Modality Selection", "Instruction Following", "Modality Synergy", "AMG Score"]
+            "AMG Score", "Overall", "Verified"],
+    "AMU": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score", "Verified"],
+    "AMG": ["Model", "Modality Selection", "Instruction Following", "Modality Synergy", "AMG Score", "Verified"]
 }
 
 def format_table(df):
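For review context, a minimal sketch (illustrative only; it reuses the diff's own column names and one row of the new CSV's values) of how a `COLUMN_GROUPS` entry is meant to drive the displayed view: selecting a group keeps only that group's columns, in that order.

```python
import pandas as pd

# Illustrative only: one COLUMN_GROUPS entry from the diff and one board row.
COLUMN_GROUPS = {
    "AMU": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score", "Verified"],
}

df = pd.DataFrame({
    "Model": ["GPT-4o†"], "Perception": [2.66], "Reasoning": [3.48],
    "IF": [4.2], "Safety": [5.15], "AMU Score": [3.87],
    "Modality Selection": [0.266],  # AMG column: dropped in the AMU view
    "Verified": ["Yes"],
})

amu_view = df[COLUMN_GROUPS["AMU"]]  # column selection preserves group order
print(amu_view)
```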
@@ -70,6 +43,7 @@ def format_table(df):
         df[col] = df[col].apply(lambda x: f'**{x}**')
 
     # Add model links
+    model_links = dict(zip(df['Model'], df['Model Link']))
     # df['Model'] = df['Model'].apply(lambda x: f'<a href="{model_links[x]}" target="_blank">{x}</a>')
     df['Model'] = df['Model'].apply(lambda x: f'[{x}]({model_links[x]})')
     # df['Model'] = df.apply(lambda x: model_hyperlink(model_links[x['Model']], x['Model']), axis=1)
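The key change in this hunk: the Model-to-URL mapping is no longer a hard-coded dict but is rebuilt inside `format_table` from the CSV's `Model Link` column. A minimal sketch of the new behavior, with one row standing in for the real board:

```python
import pandas as pd

# One row standing in for data/eval_board.csv.
df = pd.DataFrame({
    "Model": ["Chameleon-7B†"],
    "Model Link": ["https://huggingface.co/facebook/chameleon-7b"],
})

# Rebuild the Model -> URL mapping from the CSV column, as in the diff.
model_links = dict(zip(df["Model"], df["Model Link"]))

# Render each model name as a markdown link.
df["Model"] = df["Model"].apply(lambda x: f"[{x}]({model_links[x]})")
print(df["Model"].iloc[0])
# [Chameleon-7B†](https://huggingface.co/facebook/chameleon-7b)
```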
@@ -95,14 +69,18 @@ def regex_table(dataframe, regex, filter_button, column_group="ALL"):
     df = df.sort_values(by='Overall' if 'Overall' in columns_to_show else columns_to_show[-1], ascending=False)
     df.reset_index(drop=True, inplace=True)
 
-    # Format numbers and add links
-    df = format_table(df)
-
     # Add index column
     df.insert(0, '', range(1, 1 + len(df)))
 
     return df
 
+
+df = pd.read_csv("data/eval_board.csv").sort_values(by='Overall', ascending=False)
+total_models = len(df)
+
+# Format numbers and add links
+df = format_table(df)
+
 with gr.Blocks(css=custom_css) as app:
     gr.HTML(TITLE)
     with gr.Row():
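Worth noting in review: `regex_table` falls back to sorting by the group's *last* column when `Overall` is not among the shown columns, and since this commit appends `Verified` to the AMU/AMG groups, that fallback key becomes the string column `Verified` rather than a score. A small sketch of the fallback logic, using the pre-change AMG group where the last column is still `AMG Score`:

```python
import pandas as pd

# Sketch of the sort-key fallback in regex_table(), using the pre-change
# AMG group (last column = "AMG Score", a numeric score).
columns_to_show = ["Model", "Modality Selection", "Instruction Following",
                   "Modality Synergy", "AMG Score"]
sort_key = "Overall" if "Overall" in columns_to_show else columns_to_show[-1]

df = pd.DataFrame({
    "Model": ["A", "B"], "Modality Selection": [0.1, 0.2],
    "Instruction Following": [7.0, 8.0], "Modality Synergy": [0.5, 0.6],
    "AMG Score": [2.0, 3.0],
})
df = df.sort_values(by=sort_key, ascending=False).reset_index(drop=True)
df.insert(0, "", range(1, 1 + len(df)))  # 1-based rank column, as in the diff
print(df)
```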
@@ -142,8 +120,10 @@ with gr.Blocks(css=custom_css) as app:
         )
 
     with gr.TabItem("About"):
-
-
+        gr.Markdown(ABOUT_TEXT)
+
+    with gr.TabItem("Submit results 🚀", id=3):
+        gr.Markdown(SUBMISSION_TEXT)
 
     with gr.Accordion("📚 Citation", open=False):
         citation_button = gr.Textbox(
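A minimal, self-contained sketch of the tab wiring this hunk adds; the placeholder strings stand in for the `ABOUT_TEXT` and `SUBMISSION_TEXT` imported from `src/about.py` in the real app:

```python
import gradio as gr

# Placeholders for the texts imported from src.about in the real app.
ABOUT_TEXT = "About this leaderboard..."
SUBMISSION_TEXT = "How to submit models/results..."

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("About"):
            gr.Markdown(ABOUT_TEXT)
        with gr.TabItem("Submit results 🚀", id=3):
            gr.Markdown(SUBMISSION_TEXT)

if __name__ == "__main__":
    demo.launch()
```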
community_results/.gitkeep
ADDED
File without changes
data/eval_board.csv
ADDED
@@ -0,0 +1,8 @@
+Model,Perception,Reasoning,IF,Safety,AMU Score,Modality Selection,Instruction Following,Modality Synergy,AMG Score,Overall,Verified,Model Link
+LLaVA-v1.5-7B†,2.66,2.67,2.5,2.9,2.68,0.182,6.61,0.43,1.56,2.12,Yes,https://huggingface.co/liuhaotian/llava-v1.5-7b
+Qwen2-VL-7B-Instruct†,2.76,3.07,2.4,4.05,3.07,0.177,7.01,0.58,2.16,2.62,Yes,https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
+Qwen2-Audio-7B-Instruct†,3.58,4.53,3.4,2.65,3.54,0.19,6.69,0.51,1.97,2.73,Yes,https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct
+Chameleon-7B†,1.44,2.97,2.8,2.45,2.41,0.156,6.09,0.54,1.57,1.99,Yes,https://huggingface.co/facebook/chameleon-7b
+Llama3.1-8B-Instruct†,1.05,1.2,1.2,1.35,1.2,0.231,7.47,0.6,3.08,2.14,Yes,https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+Gemini-1.5-Pro†,5.36,5.67,6.7,6.7,6.11,0.227,8.62,0.52,3.05,4.58,Yes,https://deepmind.google/technologies/gemini/pro/
+GPT-4o†,2.66,3.48,4.2,5.15,3.87,0.266,8.62,0.58,3.96,3.92,Yes,https://openai.com/index/hello-gpt-4o/
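Since `app.py` now reads its board from this file, a quick schema check like the following (a hypothetical snippet, not part of the commit) catches a mismatch between the CSV and the app's `COLUMN_GROUPS` early:

```python
import pandas as pd

# Hypothetical check: verify the CSV carries every column the app expects.
REQUIRED = ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score",
            "Modality Selection", "Instruction Following", "Modality Synergy",
            "AMG Score", "Overall", "Verified", "Model Link"]

board = pd.read_csv("data/eval_board.csv")
missing = [c for c in REQUIRED if c not in board.columns]
assert not missing, f"eval_board.csv is missing columns: {missing}"
print(board[["Model", "Overall", "Verified"]].sort_values("Overall", ascending=False))
```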
src/about.py
CHANGED
@@ -54,4 +54,34 @@ CITATION_BUTTON_TEXT = """
 
 ABOUT_TEXT = """
 We will provide methods to upload more model evaluation results in the future.
 """
+
+SUBMISSION_TEXT = """
+<h1 align="center">
+How to submit models/results to the leaderboard?
+</h1>
+We welcome the community to submit evaluation results for new models. These results will be added as non-verified; however, authors are required to upload their generations so that other community members can verify the results.
+
+### 1 - Running Evaluation 🏃‍♂️
+
+We have written a detailed guide for running the evaluation on your model; you can find it in [align-anything](https://github.com/PKU-Alignment/align-anything/tree/main/align_anything/evaluation/benchmarks/leaderboard). This process generates a JSON file and a ZIP file summarizing the results, along with the raw generations and metric files.
+
+### 2 - Submitting Results 🚀
+
+To submit your results, create a **Pull Request** in the community tab to add them under the [`community_results` folder](https://huggingface.co/spaces/PKU-Alignment/EvalAnything-LeaderBoard/tree/main/community_results) of this repository:
+- Create a folder named `ORG_MODELNAME_USERNAME`, for example `PKU-Alignment_gemini1.5-pro_XuyaoWang`.
+- Place the JSON file and ZIP file with grouped scores from the guide, along with the generations folder and metrics folder, inside this newly created folder.
+
+The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`, replacing org and model with those of the model you evaluated.
+
+### 3 - Getting your model verified ✅
+A verified result in Eval-Anything indicates that a core maintainer has decoded the outputs from the model and performed the evaluation. To have your model verified, please follow these steps:
+
+1. Email us and provide a brief rationale for why your model should be verified.
+2. Await our response and approval before proceeding.
+3. Prepare a script to decode from your model that does not require a GPU. Typically, this should be the same script used for your model contribution; it should run without requiring a local GPU. We strongly recommend adapting the scripts in [align-anything](https://github.com/PKU-Alignment/align-anything/tree/main/align_anything/evaluation/benchmarks/leaderboard) to your model.
+4. Generate temporary OpenAI API keys for running the script and share them with us; specifically, we need the keys for the evaluation step.
+5. We will check and execute your script, update the results, and inform you so that you can revoke the temporary keys.
+
+**Please note that we will not re-evaluate the same model. Due to sampling variance, the results might differ slightly from your initial ones. We will replace your previous community results with the verified ones.**
+"""
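For contributors following the steps above, a hypothetical helper (`check_submission` is illustrative only, not part of the repo) that assembles the folder name and PR title from the convention described:

```python
from pathlib import Path

# Hypothetical helper illustrating the submission layout described above:
# community_results/ORG_MODELNAME_USERNAME containing the JSON and ZIP
# summaries plus the generations/ and metrics/ folders.
def check_submission(org: str, model: str, username: str,
                     root: str = "community_results") -> Path:
    folder = Path(root) / f"{org}_{model}_{username}"
    expected = ["results JSON", "results ZIP", "generations/", "metrics/"]
    print(f"PR title: [Community Submission] Model: {org}/{model}, "
          f"Username: {username}")
    print(f"Expected under {folder}: {expected}")
    return folder

check_submission("PKU-Alignment", "gemini1.5-pro", "XuyaoWang")
```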