import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import csv
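
# Columns this script relies on in the results CSV (each row has 14 fields):
#   Model Path, Benchmark Score, Num Questions Parseable, Run ID, Benchmark Completed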

def create_performance_plot(csv_path='benchmark_results.csv', metadata_path='metadata.json'):
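    """Plot benchmark score against model size and return the matplotlib Figure.

    Reads per-run results from ``csv_path``, looks up each model's size in the
    JSON file at ``metadata_path``, keeps only the latest valid run per model,
    drops outliers, and highlights the Pareto frontier of score vs. size.
    """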
    # Define whitelist of interesting models (partial matches)
    WHITELIST = [
        'Meta-Llama-3.1-70B-Instruct'
    ]

    # Read the benchmark results manually, skipping malformed rows that do not
    # have the expected number of fields
    valid_rows = []
    expected_fields = 14  # Number of expected fields in each row

    with open(csv_path, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)  # Get header row
        # Strip whitespace from header names
        header = [h.strip() for h in header]
        for row in reader:
            if len(row) == expected_fields:  # Only keep rows with correct number of fields
                # Strip whitespace from values
                valid_rows.append([val.strip() for val in row])

    # Create DataFrame from valid rows
    df = pd.DataFrame(valid_rows, columns=header)

    # Read model sizes from metadata
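    # The metadata JSON is expected to map a model path (optionally suffixed
    # with ',max_length=4096') to the model's size in billions of parameters.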
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)

    # Process the data
    # Keep only successful runs (where Benchmark Score is not FAILED)
    df = df[df['Benchmark Score'] != 'FAILED']
    df = df[df['Benchmark Score'].notna()]
    # Convert score to numeric, handling invalid values
    df['Benchmark Score'] = pd.to_numeric(df['Benchmark Score'], errors='coerce')
    df = df[df['Benchmark Score'].notna()]  # Remove rows where conversion failed

    # Convert Num Questions Parseable to numeric and scale the score by the
    # fraction of questions that were parseable (out of 171 total), so runs
    # with unparsed answers are penalised proportionally
    df['Num Questions Parseable'] = pd.to_numeric(df['Num Questions Parseable'], errors='coerce')
    df['Benchmark Score'] = df['Benchmark Score'] * (df['Num Questions Parseable'] / 171)

    # For each model, keep only the latest run
    df['Run ID'] = df['Run ID'].fillna('')
    df['timestamp'] = pd.to_datetime(df['Benchmark Completed'])
    df = df.sort_values('timestamp')
    df = df.drop_duplicates(subset=['Model Path'], keep='last')

    # Get model sizes
    def get_model_size(model_path):
        # Try exact match first
        if model_path in metadata:
            return metadata[model_path]
        # Try with max_length suffix
        if f"{model_path},max_length=4096" in metadata:
            return metadata[f"{model_path},max_length=4096"]
        return None

    # Print models without size before filtering
    print("\nModels without size assigned:")
    models_without_size = df[df['Model Path'].apply(get_model_size).isna()]
    for model in models_without_size['Model Path']:
        print(f"- {model}")

    df['Model Size'] = df['Model Path'].apply(get_model_size)
    df = df[df['Model Size'].notna()]

    # Remove extreme outliers (scores outside the 1.5 * IQR Tukey fences),
    # which are treated as erroneous runs
    q1 = df['Benchmark Score'].quantile(0.25)
    q3 = df['Benchmark Score'].quantile(0.75)
    iqr = q3 - q1
    df = df[
        (df['Benchmark Score'] >= q1 - 1.5 * iqr) & 
        (df['Benchmark Score'] <= q3 + 1.5 * iqr)
    ]

    # Find models on Pareto frontier
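    # Sweep sizes in ascending order and record a (size, score) point whenever
    # the best score seen so far improves; these points trace the frontier and
    # the corresponding models are collected in frontier_models.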
    sizes = sorted(df['Model Size'].unique())
    frontier_points = []
    max_score = float('-inf')
    frontier_models = set()

    for size in sizes:
        # Get scores for models of this size or smaller
        subset = df[df['Model Size'] <= size]
        if len(subset) > 0:
            max_score_idx = subset['Benchmark Score'].idxmax()
            current_max = subset.loc[max_score_idx, 'Benchmark Score']
            if current_max > max_score:
                max_score = current_max
                frontier_points.append((size, max_score))
                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])

    # Filter models - keep those on Pareto frontier or matching whitelist
    df['Keep'] = False
    for idx, row in df.iterrows():
        if row['Model Path'] in frontier_models:
            df.loc[idx, 'Keep'] = True
        else:
            for pattern in WHITELIST:
                if pattern in row['Model Path']:
                    df.loc[idx, 'Keep'] = True
                    break

    df = df[df['Keep']]

    # Create the plot
    fig = plt.figure(figsize=(12, 8))

    # Create scatter plot
    plt.scatter(df['Model Size'],
                df['Benchmark Score'],
                alpha=0.6)

    # Add labels for points
    for idx, row in df.iterrows():
        # Get model name - either last part of path or full name for special cases
        model_name = row['Model Path'].split('/')[-1]
        if any(pattern in row['Model Path'] for pattern in ['gpt-3', 'gpt-4']):
            model_name = row['Model Path']
            
        plt.annotate(model_name,
                    (row['Model Size'], row['Benchmark Score']),
                    xytext=(5, 5), textcoords='offset points',
                    fontsize=8,
                    bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))

    # Plot the Pareto frontier line
    if frontier_points:
        frontier_x, frontier_y = zip(*frontier_points)
        plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier')

    # Add vertical line for consumer GPU budget
    plt.axvline(x=12, color='gray', linestyle=':', label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)
    plt.text(12, -0.15, 'Consumer-budget\nGPU (24GB) limit\nin full precision', 
             horizontalalignment='center', verticalalignment='top',
             transform=plt.gca().get_xaxis_transform())

    # Customize the plot
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Model Size (billions of parameters)')
    plt.ylabel('Benchmark Score')
    plt.title('Model Performance vs Size (Pareto Frontier)')

    # Add legend
    plt.legend()

    # Adjust layout to prevent label cutoff
    plt.tight_layout()
    
    return fig

if __name__ == "__main__":
    # When run as a script, save the plot to a file
    fig = create_performance_plot()
    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')