import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import get_raw_eval_results


def _empty_leaderboard_df(cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Return a zero-row DataFrame laid out like the leaderboard.

    The frame starts with ``cols`` as its columns; the average column and
    every benchmark column are then (re)assigned as empty float Series so
    they carry a float dtype. Any of those names not already in ``cols``
    is appended as a new column by the assignment.
    """
    empty_df = pd.DataFrame(columns=cols)
    # Ensure correct column types
    empty_df[AutoEvalColumn.average.name] = pd.Series(dtype=float)
    for col in benchmark_cols:
        empty_df[col] = pd.Series(dtype=float)
    return empty_df


def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Create a dataframe from all the individual experiment results.

    Progress and diagnostics are printed to stdout (flushed) at each step.

    Args:
        results_path: Location passed to ``get_raw_eval_results``.
        cols: Column names to keep, in order, in the returned frame.
        benchmark_cols: Benchmark column names; passed to
            ``has_no_nan_values`` for the final row filter and given a
            float dtype in the empty fallback frame.

    Returns:
        The results sorted by the average column (descending), restricted
        to ``cols``, rounded to 2 decimals and filtered by
        ``has_no_nan_values``. A zero-row frame with the expected columns
        is returned instead when there are no results or when any of
        ``cols`` is missing from the loaded data.
    """
    print("\n=== Starting leaderboard creation ===", flush=True)
    print(f"Looking for results in: {results_path}", flush=True)
    print(f"Expected columns: {cols}", flush=True)
    print(f"Benchmark columns: {benchmark_cols}", flush=True)

    raw_data = get_raw_eval_results(results_path)
    print(f"\nFound {len(raw_data)} raw results", flush=True)

    all_data_json = [v.to_dict() for v in raw_data]
    print(f"\nConverted to {len(all_data_json)} JSON records", flush=True)

    if not all_data_json:
        print("\nNo data found, creating empty DataFrame", flush=True)
        return _empty_leaderboard_df(cols, benchmark_cols)

    print("Sample record keys:", list(all_data_json[0].keys()), flush=True)

    df = pd.DataFrame.from_records(all_data_json)
    print("\nCreated DataFrame with columns:", df.columns.tolist(), flush=True)
    print("DataFrame shape:", df.shape, flush=True)

    # Sorting is best-effort: a missing average column is reported and the
    # frame continues unsorted.
    try:
        df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    except KeyError as e:
        print(f"\nError sorting DataFrame: {e}", flush=True)
        print("Available columns:", df.columns.tolist(), flush=True)
    else:
        print("\nSorted DataFrame by average", flush=True)

    # Column selection is not best-effort: if any requested column is
    # missing, fall back to an empty frame with the correct structure.
    try:
        df = df[cols].round(decimals=2)
    except KeyError as e:
        print(f"\nError selecting columns: {e}", flush=True)
        print("Requested columns:", cols, flush=True)
        print("Available columns:", df.columns.tolist(), flush=True)
        return _empty_leaderboard_df(cols, benchmark_cols)
    else:
        print("\nSelected and rounded columns", flush=True)

    # filter out if perplexity hasn't been evaluated
    # NOTE(review): presumably has_no_nan_values returns a boolean row mask
    # over benchmark_cols -- confirm against src.display.formatting.
    df = df[has_no_nan_values(df, benchmark_cols)]
    print("\nFinal DataFrame shape after filtering:", df.shape, flush=True)
    print("Final columns:", df.columns.tolist(), flush=True)

    return df