Ahmed Ahmed committed
Commit 24c8512 · 1 Parent(s): 77c0f20

consolidate

Files changed (2)
  1. app.py +29 -9
  2. src/populate.py +26 -8
app.py CHANGED
@@ -1,7 +1,8 @@
 import gradio as gr
 from gradio_leaderboard import Leaderboard
 import pandas as pd
-from huggingface_hub import snapshot_download
+from huggingface_hub import snapshot_download, create_repo, RepoNotFoundError
+import os
 
 from src.about import (
     INTRODUCTION_TEXT,
@@ -14,13 +15,13 @@ from src.display.utils import (
     COLS,
     AutoEvalColumn,
 )
-from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN
+from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN, OWNER
 from src.populate import get_leaderboard_df
 from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval
 
 def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
+    if dataframe is None:
+        raise ValueError("Leaderboard DataFrame is None.")
 
     return Leaderboard(
         dataframe,
@@ -42,14 +43,33 @@ def run_perplexity_test(model_name, revision, precision):
     else:
         return f"❌ Evaluation failed: {result}"
 
-# Initialize results directory
+# Initialize results repository and directory
 try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
+    # Try to download existing repository
+    try:
+        snapshot_download(
+            repo_id=RESULTS_REPO,
+            local_dir=EVAL_RESULTS_PATH,
+            repo_type="dataset",
+            tqdm_class=None,
+            etag_timeout=30,
+            token=TOKEN
+        )
+    except RepoNotFoundError:
+        # Create the repository if it doesn't exist
+        print(f"Creating new results repository: {RESULTS_REPO}")
+        create_repo(
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            private=False,
+            token=TOKEN
+        )
+        # Create local directory
+        os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
 except Exception as e:
     print(f"Error initializing results: {e}")
+    # Ensure local directory exists even if repo operations fail
+    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
 
 # Get initial leaderboard data
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
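
Note on the new fallback path: current huggingface_hub releases do not export a RepoNotFoundError class; the exception snapshot_download raises for a missing repo is RepositoryNotFoundError, conventionally imported from huggingface_hub.utils. As written, the `from huggingface_hub import ..., RepoNotFoundError` line would fail at import time, before the surrounding try/except can catch anything. A minimal sketch of the intended download-or-create bootstrap with the corrected name (the RESULTS_REPO, EVAL_RESULTS_PATH, and TOKEN literals below are placeholders standing in for the values from src.envs, not the app's real configuration):

    import os

    from huggingface_hub import create_repo, snapshot_download
    from huggingface_hub.utils import RepositoryNotFoundError

    # Placeholders standing in for src.envs values; adjust to the real Space config.
    RESULTS_REPO = "your-org/results"        # hypothetical dataset repo id
    EVAL_RESULTS_PATH = "./eval-results"     # hypothetical local cache dir
    TOKEN = os.environ.get("HF_TOKEN")

    try:
        try:
            # Pull the existing results dataset into the local directory.
            snapshot_download(
                repo_id=RESULTS_REPO,
                local_dir=EVAL_RESULTS_PATH,
                repo_type="dataset",
                etag_timeout=30,
                token=TOKEN,
            )
        except RepositoryNotFoundError:
            # First run: the results dataset does not exist yet, so create it.
            print(f"Creating new results repository: {RESULTS_REPO}")
            create_repo(repo_id=RESULTS_REPO, repo_type="dataset", private=False, token=TOKEN)
            os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
    except Exception as e:
        # Keep the app bootable even if the Hub is unreachable.
        print(f"Error initializing results: {e}")
        os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)

The outer except Exception mirrors the diff: any other Hub error still falls back to an empty local results directory so the leaderboard can start.
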
src/populate.py CHANGED
@@ -5,13 +5,31 @@ from src.leaderboard.read_evals import get_raw_eval_results
 
 def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path)
-    all_data_json = [v.to_dict() for v in raw_data]
+    try:
+        raw_data = get_raw_eval_results(results_path)
+        all_data_json = [v.to_dict() for v in raw_data]
 
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
+        if not all_data_json:
+            # Create empty DataFrame with correct columns
+            empty_df = pd.DataFrame(columns=cols)
+            # Ensure correct column types
+            empty_df[AutoEvalColumn.average.name] = pd.Series(dtype=float)
+            for col in benchmark_cols:
+                empty_df[col] = pd.Series(dtype=float)
+            return empty_df
 
-    # filter out if perplexity hasn't been evaluated
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
+        df = pd.DataFrame.from_records(all_data_json)
+        df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+        df = df[cols].round(decimals=2)
+
+        # filter out if perplexity hasn't been evaluated
+        df = df[has_no_nan_values(df, benchmark_cols)]
+        return df
+    except Exception as e:
+        print(f"Error creating leaderboard: {e}")
+        # Return empty DataFrame with correct structure
+        empty_df = pd.DataFrame(columns=cols)
+        empty_df[AutoEvalColumn.average.name] = pd.Series(dtype=float)
+        for col in benchmark_cols:
+            empty_df[col] = pd.Series(dtype=float)
+        return empty_df
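
The empty-DataFrame fallback is now built in two places (the no-results branch and the exception handler) with identical logic. A small helper would keep the two paths in sync; _empty_leaderboard_df below is a hypothetical name, not something defined in the repo:

    import pandas as pd

    def _empty_leaderboard_df(cols: list, benchmark_cols: list, average_col: str) -> pd.DataFrame:
        # Empty frame with the leaderboard's columns; score columns get float dtype
        # so later sorting and rounding behave the same as with real results.
        empty_df = pd.DataFrame(columns=cols)
        empty_df[average_col] = pd.Series(dtype=float)
        for col in benchmark_cols:
            empty_df[col] = pd.Series(dtype=float)
        return empty_df

Both branches in get_leaderboard_df could then simply return _empty_leaderboard_df(cols, benchmark_cols, AutoEvalColumn.average.name).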