File size: 2,667 Bytes
359f755
 
77c0f20
359f755
 
77c0f20
359f755
ce8066d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24c8512
 
ce8066d
 
 
 
24c8512
ce8066d
 
 
 
 
 
 
 
24c8512
 
 
 
 
ce8066d
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas as pd
from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import get_raw_eval_results

def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Build the leaderboard table from the evaluation results under ``results_path``.

    The results returned by ``get_raw_eval_results`` are converted to dicts,
    loaded into a DataFrame, sorted by the average column (descending),
    narrowed to ``cols``, rounded to two decimals and finally filtered with
    ``has_no_nan_values`` over ``benchmark_cols``. Progress is reported on
    stdout at every step.

    Args:
        results_path: Location handed to ``get_raw_eval_results``.
        cols: Columns the returned frame must contain, in display order.
        benchmark_cols: Benchmark columns; used for the final row filter and
            typed as float in the empty fallback frame.

    Returns:
        The processed leaderboard frame. An empty frame with the ``cols``
        columns (average and benchmark columns typed as float) is returned
        instead when no results are found or when one of ``cols`` is missing
        from the loaded data.
    """

    def _log(*parts) -> None:
        # Flush on every message so output is emitted immediately.
        print(*parts, flush=True)

    def _blank_frame() -> pd.DataFrame:
        # Fallback result: the expected columns, with the numeric ones typed as float.
        blank = pd.DataFrame(columns=cols)
        for float_col in (AutoEvalColumn.average.name, *benchmark_cols):
            blank[float_col] = pd.Series(dtype=float)
        return blank

    _log("\n=== Starting leaderboard creation ===")
    _log(f"Looking for results in: {results_path}")
    _log(f"Expected columns: {cols}")
    _log(f"Benchmark columns: {benchmark_cols}")

    eval_results = get_raw_eval_results(results_path)
    _log(f"\nFound {len(eval_results)} raw results")

    records = [result.to_dict() for result in eval_results]
    _log(f"\nConverted to {len(records)} JSON records")

    # Guard clause: nothing to show, hand back the typed empty frame.
    if not records:
        _log("\nNo data found, creating empty DataFrame")
        return _blank_frame()
    _log("Sample record keys:", list(records[0].keys()))

    frame = pd.DataFrame.from_records(records)
    _log("\nCreated DataFrame with columns:", frame.columns.tolist())
    _log("DataFrame shape:", frame.shape)

    # Sorting is best-effort: a missing average column is reported and the
    # frame is processed further in its original order.
    try:
        frame = frame.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    except KeyError as err:
        _log(f"\nError sorting DataFrame: {err}")
        _log("Available columns:", frame.columns.tolist())
    else:
        _log("\nSorted DataFrame by average")

    # Column selection is mandatory: if any requested column is absent the
    # leaderboard falls back to the typed empty frame.
    try:
        frame = frame[cols].round(2)
    except KeyError as err:
        _log(f"\nError selecting columns: {err}")
        _log("Requested columns:", cols)
        _log("Available columns:", frame.columns.tolist())
        return _blank_frame()
    else:
        _log("\nSelected and rounded columns")

    # Keep only the rows selected by has_no_nan_values over the benchmark
    # columns (original note: filter out if perplexity hasn't been evaluated).
    keep_mask = has_no_nan_values(frame, benchmark_cols)
    frame = frame[keep_mask]
    _log("\nFinal DataFrame shape after filtering:", frame.shape)
    _log("Final columns:", frame.columns.tolist())

    return frame