XuyaoWang committed
Commit 0c1fd76 · 1 Parent(s): 0474b44
Files changed (4)
  1. app.py +17 -37
  2. community_results/.gitkeep +0 -0
  3. data/eval_board.csv +8 -0
  4. src/about.py +30 -0
app.py CHANGED
@@ -15,46 +15,19 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
-    ABOUT_TEXT
+    ABOUT_TEXT,
+    SUBMISSION_TEXT
 )
 from src.display.css_html_js import custom_css
 from src.display.formatting import has_no_nan_values, make_clickable_model, model_hyperlink
 
-# Model performance data and links
-model_links = {
-    "LLaVA-v1.5-7B†": "https://huggingface.co/liuhaotian/llava-v1.5-7b",
-    "Qwen2-VL-7B-Instruct†": "https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct",
-    "Qwen2-Audio-7B-Instruct†": "https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct",
-    "Chameleon-7B†": "https://huggingface.co/facebook/chameleon-7b",
-    "Llama3.1-8B-Instruct†": "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct",
-    "Gemini-1.5-Pro†": "https://deepmind.google/technologies/gemini/pro/",
-    "GPT-4o†": "https://openai.com/index/hello-gpt-4o/"
-}
-
-data = {
-    "Model": list(model_links.keys()),
-    "Perception": [2.66, 2.76, 3.58, 1.44, 1.05, 5.36, 2.66],
-    "Reasoning": [2.67, 3.07, 4.53, 2.97, 1.20, 5.67, 3.48],
-    "IF": [2.50, 2.40, 3.40, 2.80, 1.20, 6.70, 4.20],
-    "Safety": [2.90, 4.05, 2.65, 2.45, 1.35, 6.70, 5.15],
-    "AMU Score": [2.68, 3.07, 3.54, 2.41, 1.20, 6.11, 3.87],
-    "Modality Selection": [0.182, 0.177, 0.190, 0.156, 0.231, 0.227, 0.266],
-    "Instruction Following": [6.61, 7.01, 6.69, 6.09, 7.47, 8.62, 8.62],
-    "Modality Synergy": [0.43, 0.58, 0.51, 0.54, 0.60, 0.52, 0.58],
-    "AMG Score": [1.56, 2.16, 1.97, 1.57, 3.08, 3.05, 3.96],
-    "Overall": [2.12, 2.62, 2.73, 1.99, 2.14, 4.58, 3.92]
-}
-
-df = pd.DataFrame(data).sort_values(by='Overall', ascending=False)
-total_models = len(df)
-
 # Column groups
 COLUMN_GROUPS = {
     "ALL": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score",
             "Modality Selection", "Instruction Following", "Modality Synergy",
-            "AMG Score", "Overall"],
-    "AMU": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score"],
-    "AMG": ["Model", "Modality Selection", "Instruction Following", "Modality Synergy", "AMG Score"]
+            "AMG Score", "Overall", "Verified"],
+    "AMU": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score", "Verified"],
+    "AMG": ["Model", "Modality Selection", "Instruction Following", "Modality Synergy", "AMG Score", "Verified"]
 }
 
 def format_table(df):
@@ -70,6 +43,7 @@ def format_table(df):
         df[col] = df[col].apply(lambda x: f'**{x}**')
 
     # Add model links
+    model_links = dict(zip(df['Model'], df['Model Link']))
     # df['Model'] = df['Model'].apply(lambda x: f'<a href="{model_links[x]}" target="_blank">{x}</a>')
     df['Model'] = df['Model'].apply(lambda x: f'[{x}]({model_links[x]})')
     # df['Model'] = df.apply(lambda x: model_hyperlink(model_links[x['Model']], x['Model']), axis=1)
@@ -95,14 +69,18 @@ def regex_table(dataframe, regex, filter_button, column_group="ALL"):
     df = df.sort_values(by='Overall' if 'Overall' in columns_to_show else columns_to_show[-1], ascending=False)
     df.reset_index(drop=True, inplace=True)
 
-    # Format numbers and add links
-    df = format_table(df)
-
     # Add index column
     df.insert(0, '', range(1, 1 + len(df)))
 
     return df
 
+
+df = pd.read_csv("data/eval_board.csv").sort_values(by='Overall', ascending=False)
+total_models = len(df)
+
+# Format numbers and add links
+df = format_table(df)
+
 with gr.Blocks(css=custom_css) as app:
     gr.HTML(TITLE)
     with gr.Row():
@@ -142,8 +120,10 @@ with gr.Blocks(css=custom_css) as app:
             )
 
        with gr.TabItem("About"):
-            with gr.Row():
-                gr.Markdown(ABOUT_TEXT)
+            gr.Markdown(ABOUT_TEXT)
+
+        with gr.TabItem("Submit results 🚀", id=3):
+            gr.Markdown(SUBMISSION_TEXT)
 
     with gr.Accordion("📚 Citation", open=False):
         citation_button = gr.Textbox(
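
Taken together, the app.py changes replace the hard-coded score dictionaries with a CSV-driven table. Below is a minimal sketch of the new data flow; the path and column names are taken from this commit, but the snippet itself is illustrative and not part of the committed code:

```python
import pandas as pd

# Load the leaderboard table that this commit moves into data/eval_board.csv.
df = pd.read_csv("data/eval_board.csv").sort_values(by="Overall", ascending=False)

# Turn model names into markdown links via the new "Model Link" column,
# mirroring what format_table now does.
links = dict(zip(df["Model"], df["Model Link"]))
df["Model"] = df["Model"].apply(lambda m: f"[{m}]({links[m]})")

# Restrict the view to one column group, e.g. the AMU columns plus "Verified".
amu_cols = ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score", "Verified"]
print(df[amu_cols].to_string(index=False))
```

Keeping the link in its own `Model Link` column means new community submissions only touch the CSV, not the application code.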
community_results/.gitkeep ADDED
File without changes
data/eval_board.csv ADDED
@@ -0,0 +1,8 @@
+Model,Perception,Reasoning,IF,Safety,AMU Score,Modality Selection,Instruction Following,Modality Synergy,AMG Score,Overall,Verified,Model Link
+LLaVA-v1.5-7B†,2.66,2.67,2.5,2.9,2.68,0.182,6.61,0.43,1.56,2.12,Yes,https://huggingface.co/liuhaotian/llava-v1.5-7b
+Qwen2-VL-7B-Instruct†,2.76,3.07,2.4,4.05,3.07,0.177,7.01,0.58,2.16,2.62,Yes,https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
+Qwen2-Audio-7B-Instruct†,3.58,4.53,3.4,2.65,3.54,0.19,6.69,0.51,1.97,2.73,Yes,https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct
+Chameleon-7B†,1.44,2.97,2.8,2.45,2.41,0.156,6.09,0.54,1.57,1.99,Yes,https://huggingface.co/facebook/chameleon-7b
+Llama3.1-8B-Instruct†,1.05,1.2,1.2,1.35,1.2,0.231,7.47,0.6,3.08,2.14,Yes,https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+Gemini-1.5-Pro†,5.36,5.67,6.7,6.7,6.11,0.227,8.62,0.52,3.05,4.58,Yes,https://deepmind.google/technologies/gemini/pro/
+GPT-4o†,2.66,3.48,4.2,5.15,3.87,0.266,8.62,0.58,3.96,3.92,Yes,https://openai.com/index/hello-gpt-4o/
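
Since `COLUMN_GROUPS` in app.py must stay in sync with this header row, a quick consistency check can catch drift. This helper is hypothetical and not part of the commit:

```python
import pandas as pd

# Hypothetical sanity check: every column named in a COLUMN_GROUPS entry
# must exist in the CSV header, otherwise regex_table will raise a KeyError.
COLUMN_GROUPS = {
    "ALL": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score",
            "Modality Selection", "Instruction Following", "Modality Synergy",
            "AMG Score", "Overall", "Verified"],
    "AMU": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score", "Verified"],
    "AMG": ["Model", "Modality Selection", "Instruction Following", "Modality Synergy",
            "AMG Score", "Verified"],
}

header = set(pd.read_csv("data/eval_board.csv").columns)
for group, cols in COLUMN_GROUPS.items():
    missing = [c for c in cols if c not in header]
    assert not missing, f"{group} references columns missing from the CSV: {missing}"
```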
src/about.py CHANGED
@@ -54,4 +54,34 @@ CITATION_BUTTON_TEXT = """
 
 ABOUT_TEXT = """
 We will provide methods to upload more model evaluation results in the future.
 """
+
+SUBMISSION_TEXT = """
+<h1 align="center">
+How to submit models/results to the leaderboard?
+</h1>
+We welcome the community to submit evaluation results for new models. These results will be added as non-verified; however, authors are required to upload their generations so that other members can verify the results.
+
+### 1 - Running Evaluation 🏃‍♂️
+
+We have written a detailed guide for running the evaluation on your model; you can find it in [align-anything](https://github.com/PKU-Alignment/align-anything/tree/main/align_anything/evaluation/benchmarks/leaderboard). This process generates a JSON file and a ZIP file summarizing the results, along with the raw generations and metric files.
+
+### 2 - Submitting Results 🚀
+
+To submit your results, create a **Pull Request** in the community tab that adds them under the [`community_results`](https://huggingface.co/spaces/PKU-Alignment/EvalAnything-LeaderBoard/tree/main/community_results) folder of this repository:
+- Create a folder named `ORG_MODELNAME_USERNAME`, for example `PKU-Alignment_gemini1.5-pro_XuyaoWang`.
+- Place the JSON file and ZIP file with grouped scores from the guide, along with the generations folder and metrics folder, inside this newly created folder (see the sketch after this diff).
+
+The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`, replacing `org` and `model` with those of the model you evaluated.
+
+### 3 - Getting your model verified ✅
+A verified result in Eval-Anything indicates that a core maintainer has decoded the outputs from the model and performed the evaluation. To have your model verified, please follow these steps:
+
+1. Email us with a brief rationale for why your model should be verified.
+2. Await our response and approval before proceeding.
+3. Prepare a decoding script for your model that runs without requiring a local GPU; typically, this is the same script used for your model contribution. We strongly recommend adapting the scripts in [align-anything](https://github.com/PKU-Alignment/align-anything/tree/main/align_anything/evaluation/benchmarks/leaderboard) to your model.
+4. Generate temporary OpenAI API keys for running the script and share them with us; specifically, we need the keys for evaluation.
+5. We will check and execute your script, update the results, and inform you so that you can revoke the temporary keys.
+
+**Please note that we will not re-evaluate the same model. Due to sampling variance, the results might differ slightly from your initial ones. We will replace your previous community results with the verified ones.**
+"""