Ahmed Ahmed committed
Commit ce8066d · 1 Parent(s): c99a049

consolidate

Files changed (3)
  1. app.py +18 -1
  2. src/leaderboard/read_evals.py +21 -3
  3. src/populate.py +45 -20
app.py CHANGED
@@ -25,6 +25,10 @@ def init_leaderboard(dataframe):
     if dataframe is None:
         raise ValueError("Leaderboard DataFrame is None.")
 
+    print("\n=== Initializing Leaderboard ===", flush=True)
+    print(f"DataFrame shape: {dataframe.shape}", flush=True)
+    print(f"DataFrame columns: {dataframe.columns.tolist()}", flush=True)
+
     return Leaderboard(
         value=dataframe,
         select_columns=[c.name for c in fields(AutoEvalColumn) if not c.hidden],
@@ -38,8 +42,10 @@ def init_leaderboard(dataframe):
 
 def refresh_leaderboard():
     """Refresh leaderboard data from disk"""
+    print("\n=== Refreshing Leaderboard ===", flush=True)
     try:
         # Download latest results
+        print("Downloading latest results...", flush=True)
         snapshot_download(
             repo_id=RESULTS_REPO,
             local_dir=EVAL_RESULTS_PATH,
@@ -48,23 +54,34 @@ def refresh_leaderboard():
             etag_timeout=30,
             token=TOKEN
         )
+        print("Download complete", flush=True)
     except Exception as e:
-        print(f"Error refreshing results: {e}")
+        print(f"Error refreshing results: {e}", flush=True)
 
     # Get fresh leaderboard data
+    print("Getting fresh leaderboard data...", flush=True)
     df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+    print(f"Got DataFrame with shape: {df.shape}", flush=True)
     return init_leaderboard(df)
 
 def run_perplexity_test(model_name, revision, precision):
     """Run perplexity evaluation on demand."""
+    print(f"\n=== Running Perplexity Test ===", flush=True)
+    print(f"Model: {model_name}", flush=True)
+    print(f"Revision: {revision}", flush=True)
+    print(f"Precision: {precision}", flush=True)
+
     if not model_name:
         return "Please enter a model name.", None
 
     success, result = run_dynamic_perplexity_eval(model_name, revision, precision)
+    print(f"Evaluation result - Success: {success}, Result: {result}", flush=True)
 
     if success:
         # Get updated leaderboard
+        print("Refreshing leaderboard...", flush=True)
         new_leaderboard = refresh_leaderboard()
+        print("Leaderboard refresh complete", flush=True)
         return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}", new_leaderboard
     else:
         return f"❌ Evaluation failed: {result}", None
src/leaderboard/read_evals.py CHANGED
@@ -76,6 +76,9 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
+        print(f"\nProcessing result for model: {self.full_model}", flush=True)
+        print(f"Raw results: {self.results}", flush=True)
+
         # Calculate average, handling perplexity (lower is better)
         scores = []
         perplexity_score = None
@@ -90,6 +93,7 @@ class EvalResult:
             scores.append(score)
 
         average = sum(scores) / len(scores) if scores else 0
+        print(f"Calculated average score: {average}", flush=True)
 
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
@@ -111,13 +115,17 @@ class EvalResult:
         # Add perplexity score with the exact column name from Tasks
         if perplexity_score is not None:
             data_dict[Tasks.task0.value.col_name] = perplexity_score
+            print(f"Added perplexity score {perplexity_score} under column {Tasks.task0.value.col_name}", flush=True)
         else:
             data_dict[Tasks.task0.value.col_name] = None
+            print(f"No perplexity score found for column {Tasks.task0.value.col_name}", flush=True)
 
+        print(f"Final data dict keys: {list(data_dict.keys())}", flush=True)
         return data_dict
 
 def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all perplexity results"""
+    print(f"\nSearching for result files in: {results_path}", flush=True)
     model_result_filepaths = []
 
     for root, _, files in os.walk(results_path):
@@ -128,29 +136,39 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
 
+    print(f"Found {len(model_result_filepaths)} result files", flush=True)
+
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         try:
+            print(f"\nProcessing file: {model_result_filepath}", flush=True)
             # Creation of result
             eval_result = EvalResult.init_from_json_file(model_result_filepath)
+            print(f"Created result object for: {eval_result.full_model}", flush=True)
 
             # Store results of same eval together
             eval_name = eval_result.eval_name
             if eval_name in eval_results.keys():
                 eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+                print(f"Updated existing result for {eval_name}", flush=True)
             else:
                 eval_results[eval_name] = eval_result
+                print(f"Added new result for {eval_name}", flush=True)
         except Exception as e:
-            print(f"Error processing result file {model_result_filepath}: {e}")
+            print(f"Error processing result file {model_result_filepath}: {e}", flush=True)
             continue
 
     results = []
+    print(f"\nProcessing {len(eval_results)} evaluation results", flush=True)
    for v in eval_results.values():
         try:
+            print(f"\nConverting result to dict for: {v.full_model}", flush=True)
             v.to_dict() # we test if the dict version is complete
             results.append(v)
+            print("Successfully converted and added result", flush=True)
-        except KeyError as e: # not all eval values present
+        except KeyError as e:
-            print(f"Error converting result to dict: {e}")
+            print(f"Error converting result to dict: {e}", flush=True)
             continue
 
+    print(f"\nReturning {len(results)} processed results", flush=True)
     return results
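The non-None filter in the update above means a later result file for the same eval_name can only fill in or overwrite metrics it actually contains. A toy illustration of that merge rule (the metric names here are made up for the example):

# Toy illustration of the merge rule in get_raw_eval_results: an incoming file
# for the same eval_name overwrites only keys whose new value is not None.
existing = {"perplexity": 12.3, "other_metric": None}
incoming = {"perplexity": None, "other_metric": 0.61}

existing.update({k: v for k, v in incoming.items() if v is not None})
print(existing)  # {'perplexity': 12.3, 'other_metric': 0.61}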
src/populate.py CHANGED
@@ -5,31 +5,56 @@ from src.leaderboard.read_evals import get_raw_eval_results
 
 def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
+    print("\n=== Starting leaderboard creation ===", flush=True)
+    print(f"Looking for results in: {results_path}", flush=True)
+    print(f"Expected columns: {cols}", flush=True)
+    print(f"Benchmark columns: {benchmark_cols}", flush=True)
+
+    raw_data = get_raw_eval_results(results_path)
+    print(f"\nFound {len(raw_data)} raw results", flush=True)
+
+    all_data_json = [v.to_dict() for v in raw_data]
+    print(f"\nConverted to {len(all_data_json)} JSON records", flush=True)
+    if all_data_json:
+        print("Sample record keys:", list(all_data_json[0].keys()), flush=True)
+
+    if not all_data_json:
+        print("\nNo data found, creating empty DataFrame", flush=True)
+        empty_df = pd.DataFrame(columns=cols)
+        # Ensure correct column types
+        empty_df[AutoEvalColumn.average.name] = pd.Series(dtype=float)
+        for col in benchmark_cols:
+            empty_df[col] = pd.Series(dtype=float)
+        return empty_df
+
+    df = pd.DataFrame.from_records(all_data_json)
+    print("\nCreated DataFrame with columns:", df.columns.tolist(), flush=True)
+    print("DataFrame shape:", df.shape, flush=True)
+
     try:
-        raw_data = get_raw_eval_results(results_path)
-        all_data_json = [v.to_dict() for v in raw_data]
-
-        if not all_data_json:
-            # Create empty DataFrame with correct columns
-            empty_df = pd.DataFrame(columns=cols)
-            # Ensure correct column types
-            empty_df[AutoEvalColumn.average.name] = pd.Series(dtype=float)
-            for col in benchmark_cols:
-                empty_df[col] = pd.Series(dtype=float)
-            return empty_df
-
-        df = pd.DataFrame.from_records(all_data_json)
         df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-        df = df[cols].round(decimals=2)
+        print("\nSorted DataFrame by average", flush=True)
+    except KeyError as e:
+        print(f"\nError sorting DataFrame: {e}", flush=True)
+        print("Available columns:", df.columns.tolist(), flush=True)
 
-        # filter out if perplexity hasn't been evaluated
-        df = df[has_no_nan_values(df, benchmark_cols)]
-        return df
-    except Exception as e:
-        print(f"Error creating leaderboard: {e}")
-        # Return empty DataFrame with correct structure
+    try:
+        df = df[cols].round(decimals=2)
+        print("\nSelected and rounded columns", flush=True)
+    except KeyError as e:
+        print(f"\nError selecting columns: {e}", flush=True)
+        print("Requested columns:", cols, flush=True)
+        print("Available columns:", df.columns.tolist(), flush=True)
+        # Create empty DataFrame with correct structure
         empty_df = pd.DataFrame(columns=cols)
         empty_df[AutoEvalColumn.average.name] = pd.Series(dtype=float)
         for col in benchmark_cols:
             empty_df[col] = pd.Series(dtype=float)
         return empty_df
+
+    # filter out if perplexity hasn't been evaluated
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    print("\nFinal DataFrame shape after filtering:", df.shape, flush=True)
+    print("Final columns:", df.columns.tolist(), flush=True)
+
+    return df
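
For reference, has_no_nan_values is imported from elsewhere in the repo and is not touched by this commit; a minimal stand-in with the assumed behavior (not necessarily the project's exact definition, and with an illustrative column name) looks like this:

# Assumed behavior of has_no_nan_values: a boolean mask that is True for rows
# with no NaN in the given benchmark columns.
import pandas as pd

def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
    return df[columns].notna().all(axis=1)

# Mirrors the final filtering step: rows without a perplexity value are dropped.
df = pd.DataFrame({"model": ["a", "b"], "Perplexity": [12.3, None]})
df = df[has_no_nan_values(df, ["Perplexity"])]   # keeps only model "a"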