import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import csv
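
# Columns this script relies on in the results CSV (each row has 14 fields):
#   Model Path, Benchmark Score, Num Questions Parseable, Run ID, Benchmark Completed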

def create_performance_plot(csv_path='benchmark_results.csv', metadata_path='metadata.json'):
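    """Plot benchmark score against model size and return the matplotlib Figure.

    Reads per-run results from ``csv_path``, looks up each model's size in the
    JSON file at ``metadata_path``, keeps only the latest valid run per model,
    drops outliers, and highlights the Pareto frontier of score vs. size.
    """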
    # Define whitelist of interesting models (partial matches)
    WHITELIST = [
        'Meta-Llama-3.1-70B-Instruct'
    ]

    # Read the benchmark results manually, skipping malformed rows that do not
    # have the expected number of fields
    valid_rows = []
    expected_fields = 14  # Number of expected fields in each row

    with open(csv_path, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)  # Get header row
        # Strip whitespace from header names
        header = [h.strip() for h in header]
        for row in reader:
            if len(row) == expected_fields:  # Only keep rows with correct number of fields
                # Strip whitespace from values
                valid_rows.append([val.strip() for val in row])

    # Create DataFrame from valid rows
    df = pd.DataFrame(valid_rows, columns=header)

    # Read model sizes from metadata
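    # The metadata JSON is expected to map a model path (optionally suffixed
    # with ',max_length=4096') to the model's size in billions of parameters.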
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)

    # Process the data
    # Keep only successful runs (where Benchmark Score is not FAILED)
    df = df[df['Benchmark Score'] != 'FAILED']
    df = df[df['Benchmark Score'].notna()]
    # Convert score to numeric, handling invalid values
    df['Benchmark Score'] = pd.to_numeric(df['Benchmark Score'], errors='coerce')
    df = df[df['Benchmark Score'].notna()]  # Remove rows where conversion failed

    # Convert Num Questions Parseable to numeric and scale the score by the
    # fraction of questions that were parseable (out of 171 total), so runs
    # with unparsed answers are penalised proportionally
    df['Num Questions Parseable'] = pd.to_numeric(df['Num Questions Parseable'], errors='coerce')
    df['Benchmark Score'] = df['Benchmark Score'] * (df['Num Questions Parseable'] / 171)

    # For each model, keep only the latest run
    df['Run ID'] = df['Run ID'].fillna('')
    df['timestamp'] = pd.to_datetime(df['Benchmark Completed'])
    df = df.sort_values('timestamp')
    df = df.drop_duplicates(subset=['Model Path'], keep='last')

    # Get model sizes
    def get_model_size(model_path):
        # Try exact match first
        if model_path in metadata:
            return metadata[model_path]
        # Try with max_length suffix
        if f"{model_path},max_length=4096" in metadata:
            return metadata[f"{model_path},max_length=4096"]
        return None

    # Print models without size before filtering
    print("\nModels without size assigned:")
    models_without_size = df[df['Model Path'].apply(get_model_size).isna()]
    for model in models_without_size['Model Path']:
        print(f"- {model}")

    df['Model Size'] = df['Model Path'].apply(get_model_size)
    df = df[df['Model Size'].notna()]

    # Remove extreme outliers (scores outside the 1.5 * IQR Tukey fences),
    # which are treated as erroneous runs
    q1 = df['Benchmark Score'].quantile(0.25)
    q3 = df['Benchmark Score'].quantile(0.75)
    iqr = q3 - q1
    df = df[
        (df['Benchmark Score'] >= q1 - 1.5 * iqr) & 
        (df['Benchmark Score'] <= q3 + 1.5 * iqr)
    ]

    # Find models on Pareto frontier
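    # Sweep sizes in ascending order and record a (size, score) point whenever
    # the best score seen so far improves; these points trace the frontier and
    # the corresponding models are collected in frontier_models.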
    sizes = sorted(df['Model Size'].unique())
    frontier_points = []
    max_score = float('-inf')
    frontier_models = set()

    for size in sizes:
        # Get scores for models of this size or smaller
        subset = df[df['Model Size'] <= size]
        if len(subset) > 0:
            max_score_idx = subset['Benchmark Score'].idxmax()
            current_max = subset.loc[max_score_idx, 'Benchmark Score']
            if current_max > max_score:
                max_score = current_max
                frontier_points.append((size, max_score))
                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])

    # Filter models - keep those on Pareto frontier or matching whitelist
    df['Keep'] = False
    for idx, row in df.iterrows():
        if row['Model Path'] in frontier_models:
            df.loc[idx, 'Keep'] = True
        else:
            for pattern in WHITELIST:
                if pattern in row['Model Path']:
                    df.loc[idx, 'Keep'] = True
                    break

    df = df[df['Keep']]

    # Create the plot
    fig = plt.figure(figsize=(12, 8))

    # Create scatter plot
    plt.scatter(df['Model Size'],
                df['Benchmark Score'],
                alpha=0.6)

    # Add labels for points
    for idx, row in df.iterrows():
        # Get model name - either last part of path or full name for special cases
        model_name = row['Model Path'].split('/')[-1]
        if any(pattern in row['Model Path'] for pattern in ['gpt-3', 'gpt-4']):
            model_name = row['Model Path']
            
        plt.annotate(model_name,
                    (row['Model Size'], row['Benchmark Score']),
                    xytext=(5, 5), textcoords='offset points',
                    fontsize=8,
                    bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))

    # Plot the Pareto frontier line
    if frontier_points:
        frontier_x, frontier_y = zip(*frontier_points)
        plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier')

    # Add vertical line for consumer GPU budget
    plt.axvline(x=12, color='gray', linestyle=':', label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)
    plt.text(12, -0.15, 'Consumer-budget\nGPU (24GB) limit\nin full precision', 
             horizontalalignment='center', verticalalignment='top',
             transform=plt.gca().get_xaxis_transform())

    # Customize the plot
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Model Size (billions of parameters)')
    plt.ylabel('Benchmark Score')
    plt.title('Model Performance vs Size (Pareto Frontier)')

    # Add legend
    plt.legend()

    # Adjust layout to prevent label cutoff
    plt.tight_layout()
    
    return fig

if __name__ == "__main__":
    # When run as a script, save the plot to a file
    fig = create_performance_plot()
    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')