Joschka Strueber committed on
Commit f3cd231 · 1 Parent(s): 88e5618

[Add, Ref] pairwise sim, data loading, simple number example demo

Files changed (5)
  1. app.py +47 -80
  2. app_heatmap.py +103 -0
  3. requirements.txt +2 -1
  4. src/dataloading.py +46 -3
  5. src/similarity.py +25 -0
app.py CHANGED
@@ -1,103 +1,70 @@
  import gradio as gr
- import plotly.graph_objects as go
- import numpy as np
  from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets
+ from src.similarity import compute_similarity
  
- # Optionally, force a renderer (may or may not help)
- import plotly.io as pio
- pio.renderers.default = "iframe"
+ def validate_inputs(selected_model_a, selected_model_b, selected_dataset):
+     if not selected_model_a:
+         raise gr.Error("Please select Model A!")
+     if not selected_model_b:
+         raise gr.Error("Please select Model B!")
+     if not selected_dataset:
+         raise gr.Error("Please select a dataset!")
  
- def create_heatmap(selected_models, selected_dataset):
-     if not selected_models or not selected_dataset:
-         return ""  # Return empty HTML if no input
-     size = len(selected_models)
-     similarities = np.random.rand(size, size)
-     similarities = (similarities + similarities.T) / 2
-     similarities = np.round(similarities, 2)
-
-     fig = go.Figure(data=go.Heatmap(
-         z=similarities,
-         x=selected_models,
-         y=selected_models,
-         colorscale="Viridis",
-         zmin=0, zmax=1,
-         text=similarities,
-         hoverinfo="text"
-     ))
+ def display_similarity(model_a, model_b, dataset):
+     # Assuming compute_similarity returns a float or a string
+     similarity_score = compute_similarity(model_a, model_b, dataset)
+     return f"The similarity between {model_a} and {model_b} on {dataset} is: {similarity_score}"
+
+ with gr.Blocks(title="LLM Similarity Analyzer") as demo:
+     gr.Markdown("## Model Similarity Comparison Tool")
  
-     fig.update_layout(
-         title=f"Similarity Matrix for {selected_dataset}",
-         xaxis_title="Models",
-         yaxis_title="Models",
-         width=800,
-         height=800,
-         margin=dict(l=100, r=100, t=100, b=100)
+     dataset_dropdown = gr.Dropdown(
+         choices=get_leaderboard_datasets(),
+         label="Select Dataset",
+         filterable=True,
+         interactive=True,
+         info="Leaderboard benchmark datasets"
      )
  
-     # Force categorical ordering with explicit tick settings.
-     fig.update_xaxes(
-         type="category",
-         categoryorder="array",
-         categoryarray=selected_models,
-         tickangle=45,
-         automargin=True
+     model_a_dropdown = gr.Dropdown(
+         choices=get_leaderboard_models_cached(),
+         label="Select Model A",
+         filterable=True,
+         allow_custom_value=False,
+         info="Search and select models"
      )
-     fig.update_yaxes(
-         type="category",
-         categoryorder="array",
-         categoryarray=selected_models,
-         automargin=True
-     )
-
-     # Convert the figure to an HTML string that includes Plotly.js via CDN.
-     return fig.to_html(full_html=False, include_plotlyjs="cdn")
  
- def validate_inputs(selected_models, selected_dataset):
-     if not selected_models:
-         raise gr.Error("Please select at least one model!")
-     if not selected_dataset:
-         raise gr.Error("Please select a dataset!")
+     model_b_dropdown = gr.Dropdown(
+         choices=get_leaderboard_models_cached(),
+         label="Select Model B",
+         filterable=True,
+         allow_custom_value=False,
+         info="Search and select models"
+     )
  
- with gr.Blocks(title="LLM Similarity Analyzer") as demo:
-     gr.Markdown("## Model Similarity Comparison Tool")
-
-     with gr.Row():
-         dataset_dropdown = gr.Dropdown(
-             choices=get_leaderboard_datasets(),
-             label="Select Dataset",
-             filterable=True,
-             interactive=True,
-             info="Leaderboard benchmark datasets"
-         )
-         model_dropdown = gr.Dropdown(
-             choices=get_leaderboard_models_cached(),
-             label="Select Models",
-             multiselect=True,
-             filterable=True,
-             allow_custom_value=False,
-             info="Search and select multiple models"
-         )
+     generate_btn = gr.Button("Compute Similarity", variant="primary")
  
-     generate_btn = gr.Button("Generate Heatmap", variant="primary")
-     # Use an HTML component instead of gr.Plot.
-     heatmap = gr.HTML(label="Similarity Heatmap", visible=True)
+     # Textbox to display the similarity result
+     similarity_output = gr.Textbox(
+         label="Similarity Result",
+         interactive=False
+     )
  
      generate_btn.click(
          fn=validate_inputs,
-         inputs=[model_dropdown, dataset_dropdown],
+         inputs=[model_a_dropdown, model_b_dropdown, dataset_dropdown],
          queue=False
      ).then(
-         fn=create_heatmap,
-         inputs=[model_dropdown, dataset_dropdown],
-         outputs=heatmap
+         fn=display_similarity,
+         inputs=[model_a_dropdown, model_b_dropdown, dataset_dropdown],
+         outputs=similarity_output
      )
  
      clear_btn = gr.Button("Clear Selection")
      clear_btn.click(
-         lambda: [None, None, ""],
-         outputs=[model_dropdown, dataset_dropdown, heatmap]
+         lambda: [None, None, None, ""],
+         outputs=[model_a_dropdown, model_b_dropdown, dataset_dropdown, similarity_output]
      )
  
  if __name__ == "__main__":
-     # On Spaces, disable server-side rendering.
-     demo.launch(ssr_mode=False)
+     demo.launch()
 
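A note on the event wiring above: .then() runs its step even when the preceding handler raised, so a failed validate_inputs does not by itself stop display_similarity. If the installed Gradio version provides it, .success() expresses the intended gate; a sketch, not part of this commit:

# Sketch (not in this commit): .success() fires only when the previous
# handler completed without raising, so failed validation skips the compute.
generate_btn.click(
    fn=validate_inputs,
    inputs=[model_a_dropdown, model_b_dropdown, dataset_dropdown],
    queue=False
).success(
    fn=display_similarity,
    inputs=[model_a_dropdown, model_b_dropdown, dataset_dropdown],
    outputs=similarity_output
)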
app_heatmap.py ADDED
@@ -0,0 +1,103 @@
+ import gradio as gr
+ import plotly.graph_objects as go
+ import numpy as np
+ from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets
+
+ # Optionally, force a renderer (may or may not help)
+ import plotly.io as pio
+ pio.renderers.default = "iframe"
+
+ def create_heatmap(selected_models, selected_dataset):
+     if not selected_models or not selected_dataset:
+         return ""  # Return empty HTML if no input
+     size = len(selected_models)
+     similarities = np.random.rand(size, size)
+     similarities = (similarities + similarities.T) / 2
+     similarities = np.round(similarities, 2)
+
+     fig = go.Figure(data=go.Heatmap(
+         z=similarities,
+         x=selected_models,
+         y=selected_models,
+         colorscale="Viridis",
+         zmin=0, zmax=1,
+         text=similarities,
+         hoverinfo="text"
+     ))
+
+     fig.update_layout(
+         title=f"Similarity Matrix for {selected_dataset}",
+         xaxis_title="Models",
+         yaxis_title="Models",
+         width=800,
+         height=800,
+         margin=dict(l=100, r=100, t=100, b=100)
+     )
+
+     # Force categorical ordering with explicit tick settings.
+     fig.update_xaxes(
+         type="category",
+         categoryorder="array",
+         categoryarray=selected_models,
+         tickangle=45,
+         automargin=True
+     )
+     fig.update_yaxes(
+         type="category",
+         categoryorder="array",
+         categoryarray=selected_models,
+         automargin=True
+     )
+
+     # Convert the figure to an HTML string that includes Plotly.js via CDN.
+     return fig.to_html(full_html=False, include_plotlyjs="cdn")
+
+ def validate_inputs(selected_models, selected_dataset):
+     if not selected_models:
+         raise gr.Error("Please select at least one model!")
+     if not selected_dataset:
+         raise gr.Error("Please select a dataset!")
+
+ with gr.Blocks(title="LLM Similarity Analyzer") as demo:
+     gr.Markdown("## Model Similarity Comparison Tool")
+
+     with gr.Row():
+         dataset_dropdown = gr.Dropdown(
+             choices=get_leaderboard_datasets(),
+             label="Select Dataset",
+             filterable=True,
+             interactive=True,
+             info="Leaderboard benchmark datasets"
+         )
+         model_dropdown = gr.Dropdown(
+             choices=get_leaderboard_models_cached(),
+             label="Select Models",
+             multiselect=True,
+             filterable=True,
+             allow_custom_value=False,
+             info="Search and select multiple models"
+         )
+
+     generate_btn = gr.Button("Generate Heatmap", variant="primary")
+     # Use an HTML component instead of gr.Plot.
+     heatmap = gr.HTML(label="Similarity Heatmap", visible=True)
+
+     generate_btn.click(
+         fn=validate_inputs,
+         inputs=[model_dropdown, dataset_dropdown],
+         queue=False
+     ).then(
+         fn=create_heatmap,
+         inputs=[model_dropdown, dataset_dropdown],
+         outputs=heatmap
+     )
+
+     clear_btn = gr.Button("Clear Selection")
+     clear_btn.click(
+         lambda: [None, None, ""],
+         outputs=[model_dropdown, dataset_dropdown, heatmap]
+     )
+
+ if __name__ == "__main__":
+     # On Spaces, disable server-side rendering.
+     demo.launch(ssr_mode=False)
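The heatmap demo kept above still fills the matrix with symmetric random values. A minimal sketch of swapping that placeholder for real pairwise scores from the new module, assuming compute_similarity returns a float in [0, 1]:

# Sketch: replace the np.random.rand placeholder with real pairwise scores.
# Assumes compute_similarity returns a float in [0, 1].
import numpy as np
from src.similarity import compute_similarity

def pairwise_similarities(selected_models, selected_dataset):
    size = len(selected_models)
    similarities = np.zeros((size, size))
    for i in range(size):
        for j in range(i, size):
            score = compute_similarity(selected_models[i], selected_models[j], selected_dataset)
            # Similarity is symmetric, so fill both triangles at once
            similarities[i, j] = similarities[j, i] = score
    return np.round(similarities, 2)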
requirements.txt CHANGED
@@ -5,4 +5,5 @@ matplotlib
  seaborn
  plotly
  pandas
- scienceplots
+ scienceplots
+ lm-sim
src/dataloading.py CHANGED
@@ -1,9 +1,12 @@
+ import datasets
+ import numpy as np
+
  from huggingface_hub import HfApi
  from functools import lru_cache
  
  
  def get_leaderboard_models():
-     api = HfApi()
+     #api = HfApi()
  
      # List all datasets in the open-llm-leaderboard organization
      #datasets = api.list_datasets(author="open-llm-leaderboard")
@@ -38,8 +41,49 @@ def get_leaderboard_datasets():
      return [
          "ai2_arc",
          "hellaswag",
-         "mmlu",
+         "mmlu_pro",
          "truthful_qa",
          "winogrande",
          "gsm8k"
-     ]
+     ]
+
+ def filter_labels(doc):
+     labels = []
+     if "answer_index" in doc[0].keys():
+         for d in doc:
+             labels.append(int(d["answer_index"]))
+     else:
+         for d in doc:
+             if d["answer"] == "False":
+                 labels.append(0)
+             elif d["answer"] == "True":
+                 labels.append(1)
+             else:
+                 raise ValueError("Invalid label")
+     return labels
+
+ def load_run_data(model_name, dataset_name):
+     try:
+         model_name = model_name.replace("/", "__")
+
+         data = datasets.load_dataset("open-llm-leaderboard/" + model_name + "-details",
+                                      name=model_name + "__leaderboard_" + dataset_name,
+                                      split="latest")
+         data = data.sort("doc_id")
+         data = data.to_dict()
+
+         # Get log probabilities for each response
+         log_probs = []
+         for resp in data["filtered_resps"]:
+             log_prob = np.array([float(option[0]) for option in resp])
+             log_probs.append(log_prob)
+
+         # Get ground truth labels
+         labels = filter_labels(data["doc"])
+
+     except Exception as e:
+         print(e)
+         log_probs = None
+         labels = None
+
+     return log_probs, labels
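For reference, the repo and config names that load_run_data assembles follow the open-llm-leaderboard details convention; a quick worked example of the same string handling, with a hypothetical model name:

# Hypothetical example of the naming scheme load_run_data relies on.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct".replace("/", "__")
repo = "open-llm-leaderboard/" + model_name + "-details"    # dataset repo
config = model_name + "__leaderboard_" + "winogrande"       # config name
# repo   -> open-llm-leaderboard/meta-llama__Meta-Llama-3-8B-Instruct-details
# config -> meta-llama__Meta-Llama-3-8B-Instruct__leaderboard_winogrande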
src/similarity.py ADDED
@@ -0,0 +1,25 @@
+ from src.dataloading import load_run_data
+ from lmsim.metrics import Kappa_p
+
+
+ def compute_similarity(selected_model_a, selected_model_b, selected_dataset):
+     probs_a, gt_a = load_run_data(selected_model_a, selected_dataset)
+     probs_b, gt_b = load_run_data(selected_model_b, selected_dataset)
+
+     assert len(probs_a) == len(probs_b), "Models must have the same number of responses"
+
+     # Only keep responses where the ground truth is the same
+     output_a = []
+     output_b = []
+     gt = []
+     for i in range(len(probs_a)):
+         if gt_a[i] == gt_b[i]:
+             output_a.append(probs_a[i])
+             output_b.append(probs_b[i])
+             gt.append(gt_a[i])
+
+     # Compute the Kappa_p similarity over the shared examples
+     kappa_p = Kappa_p()
+     similarity = kappa_p.compute_k(output_a, output_b, gt)
+
+     return similarity
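End to end, the new module can be exercised directly; a minimal usage sketch (the model names are hypothetical placeholders, and the call downloads per-sample details from the Hub):

# Usage sketch: model names are illustrative, not part of this commit.
from src.similarity import compute_similarity

score = compute_similarity(
    "meta-llama/Meta-Llama-3-8B-Instruct",   # hypothetical model A
    "mistralai/Mistral-7B-Instruct-v0.3",    # hypothetical model B
    "winogrande",                            # one of the datasets in get_leaderboard_datasets()
)
print(f"Kappa_p similarity: {score}")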