Joschka Strueber committed on
Commit
0f7de99
·
1 Parent(s): fc18b54

[Add, Ref] integrate similarity computation, fix one-hot for EC, add login option

Browse files
app.py CHANGED
@@ -1,17 +1,21 @@
 
1
  import gradio as gr
2
  import numpy as np
3
  import matplotlib.pyplot as plt
4
  import seaborn as sns
5
  from io import BytesIO
6
  from PIL import Image
7
- from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets
8
  from huggingface_hub import login
9
 
 
 
 
10
  # Set matplotlib backend for non-GUI environments
11
  plt.switch_backend('Agg')
12
 
13
  # Login to Hugging Face Hub
14
- login(token="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")
 
15
 
16
 
17
  def create_heatmap(selected_models, selected_dataset, selected_metric):
@@ -20,13 +24,8 @@ def create_heatmap(selected_models, selected_dataset, selected_metric):
20
 
21
  # Sort models and get short names
22
  selected_models = sorted(selected_models)
23
- selected_models_short = [model.split("/")[-1] for model in selected_models]
24
 
25
- # Generate random similarity matrix
26
- size = len(selected_models)
27
- similarities = np.random.rand(size, size)
28
- similarities = (similarities + similarities.T) / 2
29
- similarities = np.round(similarities, 2)
30
 
31
  # Create figure and heatmap using seaborn
32
  plt.figure(figsize=(8, 6))
@@ -37,8 +36,8 @@ def create_heatmap(selected_models, selected_dataset, selected_metric):
37
  cmap="viridis",
38
  vmin=0,
39
  vmax=1,
40
- xticklabels=selected_models_short,
41
- yticklabels=selected_models_short
42
  )
43
 
44
  # Customize plot
 
1
+ import os
2
  import gradio as gr
3
  import numpy as np
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
  from io import BytesIO
7
  from PIL import Image
 
8
  from huggingface_hub import login
9
 
10
+ from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets
11
+ from src.similarity import load_data_and_compute_similarities
12
+
13
  # Set matplotlib backend for non-GUI environments
14
  plt.switch_backend('Agg')
15
 
16
  # Login to Hugging Face Hub
17
+ token = os.getenv("HF_TOKEN")
18
+ login(token=token)
19
 
20
 
21
  def create_heatmap(selected_models, selected_dataset, selected_metric):
 
24
 
25
  # Sort models and get short names
26
  selected_models = sorted(selected_models)
 
27
 
28
+ similarities = load_data_and_compute_similarities(selected_models, selected_dataset, selected_metric)
 
 
 
 
29
 
30
  # Create figure and heatmap using seaborn
31
  plt.figure(figsize=(8, 6))
 
36
  cmap="viridis",
37
  vmin=0,
38
  vmax=1,
39
+ xticklabels=selected_models,
40
+ yticklabels=selected_models
41
  )
42
 
43
  # Customize plot
src/__pycache__/dataloading.cpython-311.pyc ADDED
Binary file (5.8 kB). View file
 
src/dataloading.py CHANGED
@@ -57,7 +57,7 @@ def filter_labels(doc):
57
  labels = []
58
  if "answer_index" in doc[0].keys():
59
  for d in doc:
60
- labels.append(int(d["answer_index"]))
61
  else:
62
  for d in doc:
63
  if d["answer"] == "False":
@@ -66,6 +66,7 @@ def filter_labels(doc):
66
  labels.append(1)
67
  else:
68
  raise ValueError("Invalid label")
 
69
 
70
  def load_run_data(model_name, dataset_name):
71
  try:
@@ -94,10 +95,6 @@ def load_run_data(model_name, dataset_name):
94
  return log_probs, labels
95
 
96
 
97
- if __name__ == "__main__":
98
- model_ids = [
99
- 'Qwen/Qwen2.5-7B-Instruct'
100
- ]
101
-
102
- datasets = get_leaderboard_datasets(model_ids)
103
- print(datasets)
 
57
  labels = []
58
  if "answer_index" in doc[0].keys():
59
  for d in doc:
60
+ labels.append(d["answer_index"])
61
  else:
62
  for d in doc:
63
  if d["answer"] == "False":
 
66
  labels.append(1)
67
  else:
68
  raise ValueError("Invalid label")
69
+ return labels
70
 
71
  def load_run_data(model_name, dataset_name):
72
  try:
 
95
  return log_probs, labels
96
 
97
 
98
+
99
+
100
+
 
 
 
 
src/similarity.py CHANGED
@@ -1,11 +1,10 @@
1
  import numpy as np
2
 
3
- from src.dataloading import load_run_data
4
- from lmsim.metrics import Metric, Kappa_p, EC
5
 
6
 
7
-
8
- def load_data_and_compute_similarities(models, dataset, metric_name):
9
  # Load data
10
  probs = []
11
  gts = []
@@ -15,11 +14,11 @@ def load_data_and_compute_similarities(models, dataset, metric_name):
15
  gts.append(model_gt)
16
 
17
  # Compute pairwise similarities
18
- similarities = compute_pairwise_similarities(probs, gts, metric_name)
19
  return similarities
20
 
21
 
22
- def compute_similarity(metric: Metric, probs_a: list[np.array], gt_a: list[int], probs_b: list[np.array], gt_b: list[int]) -> float:
23
  # Check that the models have the same number of responses
24
  assert len(probs_a) == len(probs_b), f"Models must have the same number of responses: {len(probs_a)} != {len(probs_b)}"
25
 
@@ -44,16 +43,22 @@ def compute_pairwise_similarities(metric_name: str, probs: list[list[np.array]],
44
  if metric_name == "Kappa_p (prob.)":
45
  metric = Kappa_p()
46
  elif metric_name == "Kappa_p (det.)":
47
- metric = Kappa_p()
 
 
48
  elif metric_name == "Error Consistency":
49
  metric = EC()
50
  else:
51
- raise ValueError(f"Invalid metric: {metric_name}")
52
-
53
 
54
  similarities = np.zeros((len(probs), len(probs)))
55
  for i in range(len(probs)):
56
  for j in range(i, len(probs)):
57
  similarities[i, j] = compute_similarity(metric, probs[i], gts[i], probs[j], gts[j])
58
  similarities[j, i] = similarities[i, j]
59
- return similarities
 
 
 
 
 
 
1
  import numpy as np
2
 
3
+ from dataloading import load_run_data, get_leaderboard_datasets
4
+ from lmsim.metrics import Metrics, Kappa_p, EC
5
 
6
 
7
+ def load_data_and_compute_similarities(models: list[str], dataset: str, metric_name: str) -> np.array:
 
8
  # Load data
9
  probs = []
10
  gts = []
 
14
  gts.append(model_gt)
15
 
16
  # Compute pairwise similarities
17
+ similarities = compute_pairwise_similarities(metric_name, probs, gts)
18
  return similarities
19
 
20
 
21
+ def compute_similarity(metric: Metrics, probs_a: list[np.array], gt_a: list[int], probs_b: list[np.array], gt_b: list[int]) -> float:
22
  # Check that the models have the same number of responses
23
  assert len(probs_a) == len(probs_b), f"Models must have the same number of responses: {len(probs_a)} != {len(probs_b)}"
24
 
 
43
  if metric_name == "Kappa_p (prob.)":
44
  metric = Kappa_p()
45
  elif metric_name == "Kappa_p (det.)":
46
+ metric = Kappa_p(prob=False)
47
+ # Convert probabilities to one-hot
48
+ probs = [[one_hot(p) for p in model_probs] for model_probs in probs]
49
  elif metric_name == "Error Consistency":
50
  metric = EC()
51
  else:
52
+ raise ValueError(f"Invalid metric: {metric_name}")
 
53
 
54
  similarities = np.zeros((len(probs), len(probs)))
55
  for i in range(len(probs)):
56
  for j in range(i, len(probs)):
57
  similarities[i, j] = compute_similarity(metric, probs[i], gts[i], probs[j], gts[j])
58
  similarities[j, i] = similarities[i, j]
59
+ return similarities
60
+
61
def one_hot(probs: np.array) -> np.array:
    """Return an array shaped like ``probs`` with a single 1 at its maximum.

    Every other entry is 0. The result has the same shape and dtype as
    ``probs`` (it is created with ``np.zeros_like``). When several entries
    share the maximum value, the first one in flattened order is selected,
    following ``np.argmax``.

    Args:
        probs: Array of scores or probabilities. Must be non-empty.

    Returns:
        A new array with 1 at the position of the maximum and 0 elsewhere.

    Raises:
        ValueError: If ``probs`` is empty (raised by ``np.argmax``).
    """
    encoded = np.zeros_like(probs)
    # np.argmax without an axis yields an index into the flattened array, so
    # write through the flat view; for 1-D input this equals plain indexing,
    # and for N-D input it marks exactly one cell instead of a whole row.
    encoded.flat[np.argmax(probs)] = 1
    return encoded