"""Gradio app that renders a pairwise model-similarity heatmap.

The user picks a set of models, a benchmark dataset and a similarity metric;
the app computes the similarities via ``src.similarity`` and shows them as an
annotated seaborn heatmap.
"""

import os
from io import BytesIO

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from PIL import Image

# Kept ahead of the project imports, matching the original statement order,
# so the Gradio version is printed even if a project import fails afterwards.
print(gr.__version__)

from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets
from src.similarity import load_data_and_compute_similarities

# Set matplotlib backend for non-GUI environments
plt.switch_backend('Agg')


def create_heatmap(selected_models, selected_dataset, selected_metric):
    """Render the similarity heatmap for the current selection.

    Args:
        selected_models: Model names chosen in the multi-select dropdown.
        selected_dataset: Name of the chosen dataset.
        selected_metric: Name of the chosen similarity metric; it is passed
            through to ``load_data_and_compute_similarities`` and used in
            the plot title.

    Returns:
        An RGB ``PIL.Image.Image`` of the heatmap, or ``None`` when no model
        or no dataset is selected.
    """
    if not selected_models or not selected_dataset:
        return None

    # Sort the names so the axis order does not depend on click order.
    selected_models = sorted(selected_models)
    # NOTE(review): presumably a square model-by-model matrix aligned with
    # the sorted names (they are used as both tick label sets) -- confirm
    # against src.similarity.
    similarities = load_data_and_compute_similarities(
        selected_models, selected_dataset, selected_metric
    )

    # Draw on an explicit Figure/Axes rather than pyplot's implicit "current
    # figure", and close exactly that figure even when rendering fails, so a
    # failed request cannot leave an open figure behind.
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(
            similarities,
            annot=True,
            fmt=".2f",
            cmap="viridis",
            vmin=0,
            vmax=1,
            xticklabels=selected_models,
            yticklabels=selected_models,
            ax=ax,
        )

        # Customize plot
        ax.set_title(f"{selected_metric} for {selected_dataset}", fontsize=16)
        ax.set_xlabel("Models", fontsize=14)
        ax.set_ylabel("Models", fontsize=14)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        plt.setp(ax.get_yticklabels(), rotation=0)
        fig.tight_layout()

        # Save to an in-memory PNG buffer
        buf = BytesIO()
        fig.savefig(buf, format="png", dpi=100, bbox_inches="tight")
    finally:
        plt.close(fig)

    # Convert to PIL Image; convert() forces the pixel data to be decoded
    # from the buffer before it is returned.
    buf.seek(0)
    return Image.open(buf).convert("RGB")


def validate_inputs(selected_models, selected_dataset):
    """Check that a model selection and a dataset are present.

    Raises:
        gr.Error: If no model is selected, or if no dataset is selected.
    """
    if not selected_models:
        raise gr.Error("Please select at least one model!")
    if not selected_dataset:
        raise gr.Error("Please select a dataset!")


def update_datasets_based_on_models(selected_models, current_dataset):
    """Refresh the dataset dropdown after the model selection changed.

    The choices become the datasets returned by ``get_leaderboard_datasets``
    for the selected models (an empty list when nothing is selected). The
    current dataset is kept only if it is still among those choices;
    otherwise the dropdown value is reset to ``None``.

    Returns:
        A ``gr.update`` carrying the new ``choices`` and ``value``.
    """
    available_datasets = get_leaderboard_datasets(selected_models) if selected_models else []
    valid_dataset = current_dataset if current_dataset in available_datasets else None
    return gr.update(
        choices=available_datasets,
        value=valid_dataset
    )


# NOTE(review): the source arrived with its indentation collapsed; the nesting
# below (dataset + metric dropdowns sharing one row, everything else directly
# under Blocks) is reconstructed -- confirm against the intended layout.
with gr.Blocks(title="LLM Similarity Analyzer") as demo:
    gr.Markdown("## Model Similarity Comparison Tool")

    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=get_leaderboard_datasets(None),
            label="Select Dataset",
            filterable=True,
            interactive=True,
            allow_custom_value=False,
            info="Open LLM Leaderboard v2 benchmark datasets"
        )
        metric_dropdown = gr.Dropdown(
            choices=["Kappa_p (prob.)", "Kappa_p (det.)", "Error Consistency"],
            label="Select Metric",
            info="Select a similarity metric to compute"
        )

    model_dropdown = gr.Dropdown(
        choices=get_leaderboard_models_cached(),
        label="Select Models",
        multiselect=True,
        filterable=True,
        allow_custom_value=False,
        info="Search and select multiple models"
    )

    # Keep the dataset choices in sync with the selected models.
    model_dropdown.change(
        fn=update_datasets_based_on_models,
        inputs=[model_dropdown, dataset_dropdown],
        outputs=dataset_dropdown
    )

    generate_btn = gr.Button("Generate Heatmap", variant="primary")
    heatmap = gr.Image(label="Similarity Heatmap", visible=True)

    # Chain with .success() rather than .then(): .then() runs the next step
    # even if validation raised, whereas .success() only runs it when
    # validate_inputs completed without error.
    generate_btn.click(
        fn=validate_inputs,
        inputs=[model_dropdown, dataset_dropdown],
        queue=False
    ).success(
        fn=create_heatmap,
        inputs=[model_dropdown, dataset_dropdown, metric_dropdown],
        outputs=heatmap
    )

    # Reset the model selection, the dataset and the displayed image.
    clear_btn = gr.Button("Clear Selection")
    clear_btn.click(
        lambda: [[], None, None],
        outputs=[model_dropdown, dataset_dropdown, heatmap]
    )

if __name__ == "__main__":
    demo.launch(ssr_mode=False)