import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import csv


def create_performance_plot(csv_path='benchmark_results.csv', metadata_path='metadata.json'):
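    """Plot benchmark score vs. model size and highlight the Pareto frontier.

    Loads per-run benchmark results from csv_path and model sizes (in billions
    of parameters) from metadata_path, keeps the latest valid run per model,
    and returns the resulting matplotlib Figure.
    """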
    # Define whitelist of interesting models (partial matches)
    WHITELIST = [
        'Meta-Llama-3.1-70B-Instruct'
    ]

    # Read the benchmark results with error handling for inconsistent rows
    valid_rows = []
    expected_fields = 14  # Number of expected fields in each row

    with open(csv_path, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)  # Get header row
        # Strip whitespace from header names
        header = [h.strip() for h in header]
        for row in reader:
            if len(row) == expected_fields:  # Only keep rows with correct number of fields
                # Strip whitespace from values
                valid_rows.append([val.strip() for val in row])

    # Create DataFrame from valid rows
    df = pd.DataFrame(valid_rows, columns=header)

    # Read model sizes from metadata
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)
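    # NOTE (assumption, not verified against the actual file): metadata.json is
    # expected to map a model path, optionally suffixed with ",max_length=4096",
    # to its size in billions of parameters, e.g. {"<org>/<model-name>": 70}.
    # See get_model_size() below for the lookup logic.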
    # Process the data
    # Keep only successful runs (where Benchmark Score is not FAILED)
    df = df[df['Benchmark Score'] != 'FAILED']
    df = df[df['Benchmark Score'].notna()]

    # Convert score to numeric, handling invalid values
    df['Benchmark Score'] = pd.to_numeric(df['Benchmark Score'], errors='coerce')
    df = df[df['Benchmark Score'].notna()]  # Remove rows where conversion failed

    # Convert Num Questions Parseable to numeric and calculate adjusted score
    df['Num Questions Parseable'] = pd.to_numeric(df['Num Questions Parseable'], errors='coerce')
    df['Benchmark Score'] = df['Benchmark Score'] * (df['Num Questions Parseable'] / 171)

    # For each model, keep only the latest run
    df['Run ID'] = df['Run ID'].fillna('')
    df['timestamp'] = pd.to_datetime(df['Benchmark Completed'])
    df = df.sort_values('timestamp')
    df = df.drop_duplicates(subset=['Model Path'], keep='last')

    # Get model sizes
    def get_model_size(model_path):
        # Try exact match first
        if model_path in metadata:
            return metadata[model_path]
        # Try with max_length suffix
        if f"{model_path},max_length=4096" in metadata:
            return metadata[f"{model_path},max_length=4096"]
        return None

    # Print models without size before filtering
    print("\nModels without size assigned:")
    models_without_size = df[df['Model Path'].apply(get_model_size).isna()]
    for model in models_without_size['Model Path']:
        print(f"- {model}")

    df['Model Size'] = df['Model Path'].apply(get_model_size)
    df = df[df['Model Size'].notna()]

    # Remove extreme outliers (scores that are clearly errors)
    q1 = df['Benchmark Score'].quantile(0.25)
    q3 = df['Benchmark Score'].quantile(0.75)
    iqr = q3 - q1
    df = df[
        (df['Benchmark Score'] >= q1 - 1.5 * iqr) &
        (df['Benchmark Score'] <= q3 + 1.5 * iqr)
    ]

    # Find models on Pareto frontier
    sizes = sorted(df['Model Size'].unique())
    frontier_points = []
    max_score = float('-inf')
    frontier_models = set()
    for size in sizes:
        # Get scores for models of this size or smaller
        subset = df[df['Model Size'] <= size]
        if len(subset) > 0:
            max_score_idx = subset['Benchmark Score'].idxmax()
            current_max = subset.loc[max_score_idx, 'Benchmark Score']
            if current_max > max_score:
                max_score = current_max
                frontier_points.append((size, max_score))
                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])
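    # Every smaller size was already covered by an earlier subset, so any increase
    # in the running maximum must come from a model at the current size; the
    # collected (size, max_score) pairs therefore trace a step-wise Pareto frontier.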
    # Filter models - keep those on Pareto frontier or matching whitelist
    df['Keep'] = False
    for idx, row in df.iterrows():
        if row['Model Path'] in frontier_models:
            df.loc[idx, 'Keep'] = True
        else:
            for pattern in WHITELIST:
                if pattern in row['Model Path']:
                    df.loc[idx, 'Keep'] = True
                    break
    df = df[df['Keep']]

    # Create the plot
    fig = plt.figure(figsize=(12, 8))

    # Create scatter plot
    plt.scatter(df['Model Size'],
                df['Benchmark Score'],
                alpha=0.6)

    # Add labels for points
    for idx, row in df.iterrows():
        # Get model name - either last part of path or full name for special cases
        model_name = row['Model Path'].split('/')[-1]
        if any(pattern in row['Model Path'] for pattern in ['gpt-3', 'gpt-4']):
            model_name = row['Model Path']
        plt.annotate(model_name,
                     (row['Model Size'], row['Benchmark Score']),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=8,
                     bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))

    # Plot the Pareto frontier line
    if frontier_points:
        frontier_x, frontier_y = zip(*frontier_points)
        plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier')

    # Add vertical line for consumer GPU budget
    plt.axvline(x=12, color='gray', linestyle=':', label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)
    plt.text(12, -0.15, 'Consumer-budget\nGPU (24GB) limit\nin half precision',
             horizontalalignment='center', verticalalignment='top',
             transform=plt.gca().get_xaxis_transform())

    # Customize the plot
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Model Size (billions of parameters)')
    plt.ylabel('Benchmark Score')
    plt.title('Model Performance vs Size (Pareto Frontier)')

    # Add legend
    plt.legend()

    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    return fig

if __name__ == "__main__":
    # When run as a script, save the plot to a file
    fig = create_performance_plot()
    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')
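# Example usage (hypothetical file names): build the plot from a different
# results dump and show it interactively instead of saving it:
#
#   fig = create_performance_plot(csv_path='runs/september_results.csv',
#                                 metadata_path='runs/model_sizes.json')
#   plt.show()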