polish_eq-bench / plot_results.py
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import csv


def create_performance_plot(csv_path='benchmark_results.csv', metadata_path='metadata.json'):
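    """Plot adjusted benchmark score against model size.

    Reads per-run results from ``csv_path`` and model sizes (in billions of
    parameters) from ``metadata_path``, keeps the latest run per model,
    drops outlier scores, and highlights the Pareto frontier.
    Returns the matplotlib Figure.
    """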
    # Define whitelist of interesting models (partial matches)
    WHITELIST = [
        'Meta-Llama-3.1-70B-Instruct'
    ]

    # Read the benchmark results with error handling for inconsistent rows
    valid_rows = []
    expected_fields = 14  # Number of expected fields in each row
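    # Only the columns used below are strictly required: 'Model Path',
    # 'Benchmark Score', 'Num Questions Parseable', 'Run ID' and
    # 'Benchmark Completed'; the 14-field count is assumed to match whatever
    # layout the benchmark runner writes to benchmark_results.csv.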

    with open(csv_path, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)  # Get header row
        # Strip whitespace from header names
        header = [h.strip() for h in header]
        for row in reader:
            if len(row) == expected_fields:  # Only keep rows with correct number of fields
                # Strip whitespace from values
                valid_rows.append([val.strip() for val in row])

    # Create DataFrame from valid rows
    df = pd.DataFrame(valid_rows, columns=header)

    # Read model sizes from metadata
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)

    # Process the data
    # Keep only successful runs (where Benchmark Score is not FAILED)
    df = df[df['Benchmark Score'] != 'FAILED']
    df = df[df['Benchmark Score'].notna()]

    # Convert score to numeric, handling invalid values
    df['Benchmark Score'] = pd.to_numeric(df['Benchmark Score'], errors='coerce')
    df = df[df['Benchmark Score'].notna()]  # Remove rows where conversion failed

    # Convert Num Questions Parseable to numeric and calculate adjusted score
    df['Num Questions Parseable'] = pd.to_numeric(df['Num Questions Parseable'], errors='coerce')
    df['Benchmark Score'] = df['Benchmark Score'] * (df['Num Questions Parseable'] / 171)
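    # Note: 171 appears to be the total number of questions in the benchmark,
    # so the raw score is scaled by the fraction of answers that were parseable.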

    # For each model, keep only the latest run
    df['Run ID'] = df['Run ID'].fillna('')
    df['timestamp'] = pd.to_datetime(df['Benchmark Completed'])
    df = df.sort_values('timestamp')
    df = df.drop_duplicates(subset=['Model Path'], keep='last')

    # Get model sizes
    def get_model_size(model_path):
        # Try exact match first
        if model_path in metadata:
            return metadata[model_path]
        # Try with max_length suffix
        if f"{model_path},max_length=4096" in metadata:
            return metadata[f"{model_path},max_length=4096"]
        return None
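    # metadata.json is assumed to map a model identifier (optionally suffixed
    # with ",max_length=4096") to its size in billions of parameters,
    # e.g. {"some-org/some-model-7b": 7} (illustrative entry, not a real key).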

    # Print models without size before filtering
    print("\nModels without size assigned:")
    models_without_size = df[df['Model Path'].apply(get_model_size).isna()]
    for model in models_without_size['Model Path']:
        print(f"- {model}")

    df['Model Size'] = df['Model Path'].apply(get_model_size)
    df = df[df['Model Size'].notna()]

    # Remove extreme outliers (scores that are clearly errors)
    q1 = df['Benchmark Score'].quantile(0.25)
    q3 = df['Benchmark Score'].quantile(0.75)
    iqr = q3 - q1
    df = df[
        (df['Benchmark Score'] >= q1 - 1.5 * iqr) &
        (df['Benchmark Score'] <= q3 + 1.5 * iqr)
    ]
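    # 1.5 * IQR beyond the quartiles is the usual Tukey fence for outliers.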

    # Find models on Pareto frontier
    sizes = sorted(df['Model Size'].unique())
    frontier_points = []
    max_score = float('-inf')
    frontier_models = set()
    for size in sizes:
        # Get scores for models of this size or smaller
        subset = df[df['Model Size'] <= size]
        if len(subset) > 0:
            max_score_idx = subset['Benchmark Score'].idxmax()
            current_max = subset.loc[max_score_idx, 'Benchmark Score']
            if current_max > max_score:
                max_score = current_max
                frontier_points.append((size, max_score))
                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])
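    # A model lands on the frontier if no model of equal or smaller size
    # achieves a higher adjusted score.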

    # Filter models - keep those on Pareto frontier or matching whitelist
    df['Keep'] = False
    for idx, row in df.iterrows():
        if row['Model Path'] in frontier_models:
            df.loc[idx, 'Keep'] = True
        else:
            for pattern in WHITELIST:
                if pattern in row['Model Path']:
                    df.loc[idx, 'Keep'] = True
                    break
    df = df[df['Keep']]

    # Create the plot
    fig = plt.figure(figsize=(12, 8))

    # Create scatter plot
    plt.scatter(df['Model Size'],
                df['Benchmark Score'],
                alpha=0.6)

    # Add labels for points
    for idx, row in df.iterrows():
        # Get model name - either last part of path or full name for special cases
        model_name = row['Model Path'].split('/')[-1]
        if any(pattern in row['Model Path'] for pattern in ['gpt-3', 'gpt-4']):
            model_name = row['Model Path']
        plt.annotate(model_name,
                     (row['Model Size'], row['Benchmark Score']),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=8,
                     bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))

    # Plot the Pareto frontier line
    if frontier_points:
        frontier_x, frontier_y = zip(*frontier_points)
        plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier')

    # Add vertical line for the consumer GPU budget: a ~12B-parameter model in
    # half precision is roughly what fits on a 24 GB card
    plt.axvline(x=12, color='gray', linestyle=':', label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)
    plt.text(12, -0.15, 'Consumer-budget\nGPU (24GB) limit\nin half precision',
             horizontalalignment='center', verticalalignment='top',
             transform=plt.gca().get_xaxis_transform())

    # Customize the plot
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Model Size (billions of parameters)')
    plt.ylabel('Benchmark Score')
    plt.title('Model Performance vs Size (Pareto Frontier)')

    # Add legend
    plt.legend()

    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    return fig


if __name__ == "__main__":
    # When run as a script, save the plot to a file
    fig = create_performance_plot()
    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')
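    # Other input locations can be passed explicitly if needed, e.g.
    # create_performance_plot('other_results.csv', 'other_metadata.json')
    # (illustrative paths).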