wip
- app.py +17 -37
- community_results/.gitkeep +0 -0
- data/eval_board.csv +8 -0
- src/about.py +30 -0
app.py
CHANGED
@@ -15,46 +15,19 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
-    ABOUT_TEXT
+    ABOUT_TEXT,
+    SUBMISSION_TEXT
 )
 from src.display.css_html_js import custom_css
 from src.display.formatting import has_no_nan_values, make_clickable_model, model_hyperlink
 
-# Define model performance data and links
-model_links = {
-    "LLaVA-v1.5-7B†": "https://huggingface.co/liuhaotian/llava-v1.5-7b",
-    "Qwen2-VL-7B-Instruct†": "https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct",
-    "Qwen2-Audio-7B-Instruct†": "https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct",
-    "Chameleon-7B†": "https://huggingface.co/facebook/chameleon-7b",
-    "Llama3.1-8B-Instruct†": "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct",
-    "Gemini-1.5-Pro†": "https://deepmind.google/technologies/gemini/pro/",
-    "GPT-4o†": "https://openai.com/index/hello-gpt-4o/"
-}
-
-data = {
-    "Model": list(model_links.keys()),
-    "Perception": [2.66, 2.76, 3.58, 1.44, 1.05, 5.36, 2.66],
-    "Reasoning": [2.67, 3.07, 4.53, 2.97, 1.20, 5.67, 3.48],
-    "IF": [2.50, 2.40, 3.40, 2.80, 1.20, 6.70, 4.20],
-    "Safety": [2.90, 4.05, 2.65, 2.45, 1.35, 6.70, 5.15],
-    "AMU Score": [2.68, 3.07, 3.54, 2.41, 1.20, 6.11, 3.87],
-    "Modality Selection": [0.182, 0.177, 0.190, 0.156, 0.231, 0.227, 0.266],
-    "Instruction Following": [6.61, 7.01, 6.69, 6.09, 7.47, 8.62, 8.62],
-    "Modality Synergy": [0.43, 0.58, 0.51, 0.54, 0.60, 0.52, 0.58],
-    "AMG Score": [1.56, 2.16, 1.97, 1.57, 3.08, 3.05, 3.96],
-    "Overall": [2.12, 2.62, 2.73, 1.99, 2.14, 4.58, 3.92]
-}
-
-df = pd.DataFrame(data).sort_values(by='Overall', ascending=False)
-total_models = len(df)
-
 # Define column groups
 COLUMN_GROUPS = {
     "ALL": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score",
             "Modality Selection", "Instruction Following", "Modality Synergy",
-            "AMG Score", "Overall"],
-    "AMU": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score"],
-    "AMG": ["Model", "Modality Selection", "Instruction Following", "Modality Synergy", "AMG Score"]
+            "AMG Score", "Overall", "Verified"],
+    "AMU": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score", "Verified"],
+    "AMG": ["Model", "Modality Selection", "Instruction Following", "Modality Synergy", "AMG Score", "Verified"]
 }
 
 def format_table(df):
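For review context, a minimal sketch (illustrative only; it reuses the diff's own column names and one row of the new CSV's values) of how a `COLUMN_GROUPS` entry is meant to drive the displayed view: selecting a group keeps only that group's columns, in that order.

```python
import pandas as pd

# Illustrative only: one COLUMN_GROUPS entry from the diff and one board row.
COLUMN_GROUPS = {
    "AMU": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score", "Verified"],
}

df = pd.DataFrame({
    "Model": ["GPT-4o†"], "Perception": [2.66], "Reasoning": [3.48],
    "IF": [4.2], "Safety": [5.15], "AMU Score": [3.87],
    "Modality Selection": [0.266],  # AMG column: dropped in the AMU view
    "Verified": ["Yes"],
})

amu_view = df[COLUMN_GROUPS["AMU"]]  # column selection preserves group order
print(amu_view)
```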
@@ -70,6 +43,7 @@ def format_table(df):
         df[col] = df[col].apply(lambda x: f'**{x}**')
 
     # Add model links
+    model_links = dict(zip(df['Model'], df['Model Link']))
     # df['Model'] = df['Model'].apply(lambda x: f'<a href="{model_links[x]}" target="_blank">{x}</a>')
     df['Model'] = df['Model'].apply(lambda x: f'[{x}]({model_links[x]})')
     # df['Model'] = df.apply(lambda x: model_hyperlink(model_links[x['Model']], x['Model']), axis=1)
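The key change in this hunk: the Model-to-URL mapping is no longer a hard-coded dict but is rebuilt inside `format_table` from the CSV's `Model Link` column. A minimal sketch of the new behavior, with one row standing in for the real board:

```python
import pandas as pd

# One row standing in for data/eval_board.csv.
df = pd.DataFrame({
    "Model": ["Chameleon-7B†"],
    "Model Link": ["https://huggingface.co/facebook/chameleon-7b"],
})

# Rebuild the Model -> URL mapping from the CSV column, as in the diff.
model_links = dict(zip(df["Model"], df["Model Link"]))

# Render each model name as a markdown link.
df["Model"] = df["Model"].apply(lambda x: f"[{x}]({model_links[x]})")
print(df["Model"].iloc[0])
# [Chameleon-7B†](https://huggingface.co/facebook/chameleon-7b)
```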
@@ -95,14 +69,18 @@ def regex_table(dataframe, regex, filter_button, column_group="ALL"):
     df = df.sort_values(by='Overall' if 'Overall' in columns_to_show else columns_to_show[-1], ascending=False)
     df.reset_index(drop=True, inplace=True)
 
-    # Format numbers and add links
-    df = format_table(df)
-
     # Add index column
     df.insert(0, '', range(1, 1 + len(df)))
 
     return df
 
+
+df = pd.read_csv("data/eval_board.csv").sort_values(by='Overall', ascending=False)
+total_models = len(df)
+
+# Format numbers and add links
+df = format_table(df)
+
 with gr.Blocks(css=custom_css) as app:
     gr.HTML(TITLE)
     with gr.Row():
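Worth noting in review: `regex_table` falls back to sorting by the group's *last* column when `Overall` is not among the shown columns, and since this commit appends `Verified` to the AMU/AMG groups, that fallback key becomes the string column `Verified` rather than a score. A small sketch of the fallback logic, using the pre-change AMG group where the last column is still `AMG Score`:

```python
import pandas as pd

# Sketch of the sort-key fallback in regex_table(), using the pre-change
# AMG group (last column = "AMG Score", a numeric score).
columns_to_show = ["Model", "Modality Selection", "Instruction Following",
                   "Modality Synergy", "AMG Score"]
sort_key = "Overall" if "Overall" in columns_to_show else columns_to_show[-1]

df = pd.DataFrame({
    "Model": ["A", "B"], "Modality Selection": [0.1, 0.2],
    "Instruction Following": [7.0, 8.0], "Modality Synergy": [0.5, 0.6],
    "AMG Score": [2.0, 3.0],
})
df = df.sort_values(by=sort_key, ascending=False).reset_index(drop=True)
df.insert(0, "", range(1, 1 + len(df)))  # 1-based rank column, as in the diff
print(df)
```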
@@ -142,8 +120,10 @@ with gr.Blocks(css=custom_css) as app:
         )
 
     with gr.TabItem("About"):
-
-
+        gr.Markdown(ABOUT_TEXT)
+
+    with gr.TabItem("Submit results 🚀", id=3):
+        gr.Markdown(SUBMISSION_TEXT)
 
     with gr.Accordion("📚 Citation", open=False):
         citation_button = gr.Textbox(
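A minimal, self-contained sketch of the tab wiring this hunk adds; the placeholder strings stand in for the `ABOUT_TEXT` and `SUBMISSION_TEXT` imported from `src/about.py` in the real app:

```python
import gradio as gr

# Placeholders for the texts imported from src.about in the real app.
ABOUT_TEXT = "About this leaderboard..."
SUBMISSION_TEXT = "How to submit models/results..."

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("About"):
            gr.Markdown(ABOUT_TEXT)
        with gr.TabItem("Submit results 🚀", id=3):
            gr.Markdown(SUBMISSION_TEXT)

if __name__ == "__main__":
    demo.launch()
```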
community_results/.gitkeep
ADDED
File without changes
data/eval_board.csv
ADDED
@@ -0,0 +1,8 @@
+Model,Perception,Reasoning,IF,Safety,AMU Score,Modality Selection,Instruction Following,Modality Synergy,AMG Score,Overall,Verified,Model Link
+LLaVA-v1.5-7B†,2.66,2.67,2.5,2.9,2.68,0.182,6.61,0.43,1.56,2.12,Yes,https://huggingface.co/liuhaotian/llava-v1.5-7b
+Qwen2-VL-7B-Instruct†,2.76,3.07,2.4,4.05,3.07,0.177,7.01,0.58,2.16,2.62,Yes,https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
+Qwen2-Audio-7B-Instruct†,3.58,4.53,3.4,2.65,3.54,0.19,6.69,0.51,1.97,2.73,Yes,https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct
+Chameleon-7B†,1.44,2.97,2.8,2.45,2.41,0.156,6.09,0.54,1.57,1.99,Yes,https://huggingface.co/facebook/chameleon-7b
+Llama3.1-8B-Instruct†,1.05,1.2,1.2,1.35,1.2,0.231,7.47,0.6,3.08,2.14,Yes,https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+Gemini-1.5-Pro†,5.36,5.67,6.7,6.7,6.11,0.227,8.62,0.52,3.05,4.58,Yes,https://deepmind.google/technologies/gemini/pro/
+GPT-4o†,2.66,3.48,4.2,5.15,3.87,0.266,8.62,0.58,3.96,3.92,Yes,https://openai.com/index/hello-gpt-4o/
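Since `app.py` now reads its board from this file, a quick schema check like the following (a hypothetical snippet, not part of the commit) catches a mismatch between the CSV and the app's `COLUMN_GROUPS` early:

```python
import pandas as pd

# Hypothetical check: verify the CSV carries every column the app expects.
REQUIRED = ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score",
            "Modality Selection", "Instruction Following", "Modality Synergy",
            "AMG Score", "Overall", "Verified", "Model Link"]

board = pd.read_csv("data/eval_board.csv")
missing = [c for c in REQUIRED if c not in board.columns]
assert not missing, f"eval_board.csv is missing columns: {missing}"
print(board[["Model", "Overall", "Verified"]].sort_values("Overall", ascending=False))
```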
src/about.py
CHANGED
@@ -54,4 +54,34 @@ CITATION_BUTTON_TEXT = """
 
 ABOUT_TEXT = """
 We will provide methods to upload more model evaluation results in the future.
 """
+
+SUBMISSION_TEXT = """
+<h1 align="center">
+How to submit models/results to the leaderboard?
+</h1>
+We welcome the community to submit evaluation results for new models. These results will be added as non-verified; however, authors are required to upload their generations so that other community members can verify the results.
+
+### 1 - Running Evaluation 🏃‍♂️
+
+We have written a detailed guide for running the evaluation on your model; you can find it in [align-anything](https://github.com/PKU-Alignment/align-anything/tree/main/align_anything/evaluation/benchmarks/leaderboard). This process generates a JSON file and a ZIP file summarizing the results, along with the raw generations and metric files.
+
+### 2 - Submitting Results 🚀
+
+To submit your results, create a **Pull Request** in the community tab to add them under the [`community_results` folder](https://huggingface.co/spaces/PKU-Alignment/EvalAnything-LeaderBoard/tree/main/community_results) of this repository:
+- Create a folder named `ORG_MODELNAME_USERNAME`, for example `PKU-Alignment_gemini1.5-pro_XuyaoWang`.
+- Place the JSON file and ZIP file with grouped scores from the guide, along with the generations folder and metrics folder, inside this newly created folder.
+
+The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`, replacing org and model with those of the model you evaluated.
+
+### 3 - Getting your model verified ✅
+A verified result in Eval-Anything indicates that a core maintainer has decoded the outputs from the model and performed the evaluation. To have your model verified, please follow these steps:
+
+1. Email us and provide a brief rationale for why your model should be verified.
+2. Await our response and approval before proceeding.
+3. Prepare a script to decode from your model that does not require a GPU. Typically, this should be the same script used for your model contribution; it should run without requiring a local GPU. We strongly recommend adapting the scripts in [align-anything](https://github.com/PKU-Alignment/align-anything/tree/main/align_anything/evaluation/benchmarks/leaderboard) to your model.
+4. Generate temporary OpenAI API keys for running the script and share them with us; specifically, we need the keys for the evaluation step.
+5. We will check and execute your script, update the results, and inform you so that you can revoke the temporary keys.
+
+**Please note that we will not re-evaluate the same model. Due to sampling variance, the results might differ slightly from your initial ones. We will replace your previous community results with the verified ones.**
+"""
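For contributors following the steps above, a hypothetical helper (`check_submission` is illustrative only, not part of the repo) that assembles the folder name and PR title from the convention described:

```python
from pathlib import Path

# Hypothetical helper illustrating the submission layout described above:
# community_results/ORG_MODELNAME_USERNAME containing the JSON and ZIP
# summaries plus the generations/ and metrics/ folders.
def check_submission(org: str, model: str, username: str,
                     root: str = "community_results") -> Path:
    folder = Path(root) / f"{org}_{model}_{username}"
    expected = ["results JSON", "results ZIP", "generations/", "metrics/"]
    print(f"PR title: [Community Submission] Model: {org}/{model}, "
          f"Username: {username}")
    print(f"Expected under {folder}: {expected}")
    return folder

check_submission("PKU-Alignment", "gemini1.5-pro", "XuyaoWang")
```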