Spaces:

speakleash
/

polish_eq-bench

Running

App Files Community

djstrong committed on Mar 3

Commit

7a9f32a

1 Parent(s): 235501b

update

Browse files

Files changed (4) hide show

app.py +14 -3
metadata.json +29 -1
plot_results.py +153 -0
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -12,6 +12,7 @@ from src.about import (
     AUTHORS,
 )
 from src.display.formatting import make_clickable_model
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -96,9 +97,16 @@ with demo:
     # rename columns
     leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model"})
     leaderboard_df = leaderboard_df.rename(columns={"Num Questions Parseable": "Percentage Questions Parseable"})
-    leaderboard_df_styled = leaderboard_df.style.background_gradient(cmap="RdYlGn")
-    leaderboard_df_styled = leaderboard_df_styled.background_gradient(cmap="RdYlGn_r", subset=['Params'])
     rounding = {}
     # for col in ["Benchmark Score", "Num Questions Parseable"]:
@@ -110,13 +118,16 @@ with demo:
     leaderboard_table = gr.components.Dataframe(
         value=leaderboard_df_styled,
-        # headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
         datatype=['markdown', 'number', 'number', 'number', 'str'],
         elem_id="leaderboard-table",
         interactive=False,
         visible=True,
     )
     gr.Markdown(AUTHORS, elem_classes="markdown-text")
     demo.queue(default_concurrency_limit=40).launch()

     AUTHORS,
 )
 from src.display.formatting import make_clickable_model
+from plot_results import create_performance_plot
 demo = gr.Blocks(css=custom_css)
 with demo:
     # rename columns
     leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model"})
     leaderboard_df = leaderboard_df.rename(columns={"Num Questions Parseable": "Percentage Questions Parseable"})
+    # Set midpoint for gradient coloring based on data ranges
+    leaderboard_df_styled = leaderboard_df.style.background_gradient(
+        cmap="RdYlGn"
+    )
+    leaderboard_df_styled = leaderboard_df_styled.background_gradient(
+        cmap="RdYlGn_r",
+        subset=['Params'],
+        vmax=150
+    )
     rounding = {}
     # for col in ["Benchmark Score", "Num Questions Parseable"]:
     leaderboard_table = gr.components.Dataframe(
         value=leaderboard_df_styled,
         datatype=['markdown', 'number', 'number', 'number', 'str'],
         elem_id="leaderboard-table",
         interactive=False,
         visible=True,
     )
+    # Create and show the performance plot below the table
+    fig = create_performance_plot()
+    plot = gr.Plot(value=fig, elem_id="performance-plot")
     gr.Markdown(AUTHORS, elem_classes="markdown-text")
     demo.queue(default_concurrency_limit=40).launch()

metadata.json CHANGED Viewed

@@ -319,5 +319,33 @@
   "speakleash/Bielik-11B-v2.0-Instruct": 11,
   "speakleash/Bielik-11B-v2.2-Instruct": 11,
   "speakleash/Bielik-11B-v2.1-Instruct": 11,
-  "speakleash/Bielik-11B-v2.3-Instruct": 11
 }

   "speakleash/Bielik-11B-v2.0-Instruct": 11,
   "speakleash/Bielik-11B-v2.2-Instruct": 11,
   "speakleash/Bielik-11B-v2.1-Instruct": 11,
+  "speakleash/Bielik-11B-v2.3-Instruct": 11,
+  "CYFRAGOVPL/PLLuM-12B-nc-chat": 12,
+  "CYFRAGOVPL/PLLuM-12B-chat": 12,
+  "CYFRAGOVPL/PLLuM-12B-instruct": 12,
+  "CYFRAGOVPL/Llama-PLLuM-8B-instruct": 8,
+  "CYFRAGOVPL/PLLuM-12B-nc-instruct": 12,
+  "CYFRAGOVPL/Llama-PLLuM-8B-chat": 8,
+  "CYFRAGOVPL/PLLuM-8x7B-nc-chat": 46.7,
+  "CYFRAGOVPL/PLLuM-8x7B-nc-instruct": 46.7,
+  "CYFRAGOVPL/PLLuM-8x7B-chat": 46.7,
+  "CYFRAGOVPL/PLLuM-8x7B-instruct": 46.7,
+  "CYFRAGOVPL/Llama-PLLuM-70B-chat": 70,
+  "CYFRAGOVPL/Llama-PLLuM-70B-instruct": 70,
+  "Qwen/Qwen2.5-7B-Instruct": 7,
+  "Qwen/Qwen2.5-14B-Instruct": 14,
+  "Qwen/Qwen2.5-1.5B-Instruct": 1.5,
+  "microsoft/phi-4": 14.7,
+  "Qwen/Qwen2.5-32B-Instruct": 32,
+  "Qwen/Qwen2.5-72B-Instruct": 72,
+  "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": 70,
+  "meta-llama/Llama-3.2-1B-Instruct": 1,
+  "utter-project/EuroLLM-9B-Instruct": 9,
+  "mistralai/Mistral-Small-Instruct-2409": 22.2,
+  "mistralai/Mistral-Small-24B-Instruct-2501": 24,
+  "meta-llama/Llama-3.3-70B-Instruct": 70,
+  "meta-llama/Llama-3.2-3B-Instruct": 3,
+  "Qwen/Qwen2.5-3B-Instruct": 3,
+  "mistralai/Mistral-Nemo-Instruct-2407": 12,
+  "microsoft/Phi-4-mini-instruct": 4
 }

plot_results.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+import json
+import csv
+def create_performance_plot(csv_path='benchmark_results.csv', metadata_path='metadata.json'):
+    """Plot benchmark score against model size and mark the Pareto frontier.
+
+    Reads benchmark runs from ``csv_path`` and model sizes from
+    ``metadata_path``, keeps the latest successful run of each model, drops
+    models without a known size as well as IQR outliers, and finally plots
+    only the models that lie on the size/score Pareto frontier or match the
+    whitelist.
+
+    Args:
+        csv_path: CSV file with a header row that contains at least the
+            columns 'Model Path', 'Benchmark Score',
+            'Num Questions Parseable' and 'Benchmark Completed'.
+        metadata_path: JSON file mapping a model path to its size. The size
+            is used as the x coordinate (axis label: billions of parameters).
+
+    Returns:
+        The matplotlib figure that holds the plot.
+
+    Raises:
+        ValueError: If ``csv_path`` has no header row (empty file).
+    """
+    # Substrings of model paths that are always plotted, even off the frontier.
+    WHITELIST = [
+        'Meta-Llama-3.1-70B-Instruct'
+    ]
+    # Divisor that turns 'Num Questions Parseable' into a fraction
+    # (presumably the total number of benchmark questions - TODO confirm).
+    TOTAL_QUESTIONS = 171
+    # newline='' is what the csv module expects so that line breaks inside
+    # quoted fields are handled by the reader itself.
+    with open(csv_path, 'r', newline='', encoding='utf-8') as f:
+        reader = csv.reader(f)
+        header = next(reader, None)
+        if header is None:
+            raise ValueError(f"{csv_path} is empty: no header row found")
+        header = [h.strip() for h in header]
+        # Skip malformed rows: only rows exactly as wide as the header can be
+        # loaded into the DataFrame below.
+        valid_rows = [
+            [val.strip() for val in row]
+            for row in reader
+            if len(row) == len(header)
+        ]
+    df = pd.DataFrame(valid_rows, columns=header)
+    # Read model sizes from metadata
+    with open(metadata_path, 'r', encoding='utf-8') as f:
+        metadata = json.load(f)
+    # Keep only successful runs: 'FAILED' and any other non-numeric score or
+    # question count becomes NaN here, which makes the adjusted score NaN.
+    df['Benchmark Score'] = pd.to_numeric(df['Benchmark Score'], errors='coerce')
+    df['Num Questions Parseable'] = pd.to_numeric(df['Num Questions Parseable'], errors='coerce')
+    # Scale the score by the fraction of questions that could be parsed.
+    df['Benchmark Score'] = df['Benchmark Score'] * (df['Num Questions Parseable'] / TOTAL_QUESTIONS)
+    # Drop unusable runs before de-duplication so they cannot shadow an
+    # earlier valid run of the same model; copy to own the filtered frame.
+    df = df[df['Benchmark Score'].notna()].copy()
+    # For each model, keep only the latest run
+    df['timestamp'] = pd.to_datetime(df['Benchmark Completed'])
+    df = df.sort_values('timestamp', kind='stable')
+    df = df.drop_duplicates(subset=['Model Path'], keep='last')
+
+    def get_model_size(model_path):
+        # Exact key first, then the key variant carrying a max_length suffix.
+        for key in (model_path, f"{model_path},max_length=4096"):
+            if key in metadata:
+                return metadata[key]
+        return None
+
+    df['Model Size'] = df['Model Path'].apply(get_model_size)
+    # Report models that will be left out because their size is unknown.
+    print("\nModels without size assigned:")
+    for model in df.loc[df['Model Size'].isna(), 'Model Path']:
+        print(f"- {model}")
+    df = df[df['Model Size'].notna()]
+    # Remove extreme outliers (scores outside 1.5 * IQR of the quartiles)
+    q1 = df['Benchmark Score'].quantile(0.25)
+    q3 = df['Benchmark Score'].quantile(0.75)
+    iqr = q3 - q1
+    df = df[
+        (df['Benchmark Score'] >= q1 - 1.5 * iqr) &
+        (df['Benchmark Score'] <= q3 + 1.5 * iqr)
+    ]
+    # Find models on the Pareto frontier: walking sizes in ascending order, a
+    # size contributes a point only if its best score strictly beats every
+    # smaller size. Such a strict improvement can only come from a model of
+    # exactly this size, so looking at the current size group is sufficient.
+    frontier_points = []
+    frontier_models = set()
+    max_score = float('-inf')
+    for size in sorted(df['Model Size'].unique()):
+        group = df[df['Model Size'] == size]
+        best_idx = group['Benchmark Score'].idxmax()
+        best_score = group.loc[best_idx, 'Benchmark Score']
+        if best_score > max_score:
+            max_score = best_score
+            frontier_points.append((size, max_score))
+            frontier_models.add(group.loc[best_idx, 'Model Path'])
+    # Filter models - keep those on the Pareto frontier or matching the
+    # whitelist. The mask is an index-aligned boolean Series so that it also
+    # works when no rows are left.
+    keep = pd.Series(
+        [
+            path in frontier_models or any(pattern in path for pattern in WHITELIST)
+            for path in df['Model Path']
+        ],
+        index=df.index,
+        dtype=bool,
+    )
+    df = df[keep]
+    # Draw on an explicit figure/axes pair instead of pyplot's implicit
+    # "current figure" state.
+    fig, ax = plt.subplots(figsize=(12, 8))
+    ax.scatter(df['Model Size'],
+               df['Benchmark Score'],
+               alpha=0.6)
+    # Add labels for points
+    for path, size, score in zip(df['Model Path'], df['Model Size'], df['Benchmark Score']):
+        # Label with the last path component, except for gpt-3/gpt-4 entries,
+        # which keep their full name.
+        if any(pattern in path for pattern in ('gpt-3', 'gpt-4')):
+            model_name = path
+        else:
+            model_name = path.split('/')[-1]
+        ax.annotate(model_name,
+                    (size, score),
+                    xytext=(5, 5), textcoords='offset points',
+                    fontsize=8,
+                    bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))
+    # Plot the Pareto frontier line
+    if frontier_points:
+        frontier_x, frontier_y = zip(*frontier_points)
+        ax.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier')
+        # Only add a legend when there is a labelled artist to show.
+        ax.legend()
+    # Customize the plot
+    ax.grid(True, linestyle='--', alpha=0.7)
+    ax.set_xlabel('Model Size (billions of parameters)')
+    ax.set_ylabel('Benchmark Score')
+    ax.set_title('Model Performance vs Size (Pareto Frontier)')
+    # Adjust layout to prevent label cutoff
+    fig.tight_layout()
+    return fig
+if __name__ == "__main__":
+    # Script entry point: render the plot with the default input paths and
+    # write it to a PNG file in the working directory.
+    performance_figure = create_performance_plot()
+    performance_figure.savefig(
+        'model_performance.png',
+        dpi=300,
+        bbox_inches='tight',
+    )

requirements.txt CHANGED Viewed

@@ -2,3 +2,4 @@ tqdm
 gradio
 gradio_client
 pandas

 gradio
 gradio_client
 pandas
+matplotlib