Spaces:
Running
Running
File size: 7,735 Bytes
17aa8f3 982fdda bdbadad 90cb3d2 4bcc990 bdbadad 90cb3d2 bdbadad 90cb3d2 982fdda bdbadad 90cb3d2 bdbadad 90cb3d2 bdbadad 982fdda bdbadad 90cb3d2 4bcc990 90cb3d2 4bcc990 90cb3d2 bdbadad 90cb3d2 bdbadad 90cb3d2 bdbadad 90cb3d2 bdbadad 90cb3d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
# Input data: one row per model. Each row holds the model identifier string
# followed by six numeric scores (all values below lie between 0 and 1).
# The scores are positional; their meaning comes from the `columns` list that
# is used to label this table when it is loaded into a DataFrame.
data_full = [
["CultriX/Qwen2.5-14B-SLERPv7", 0.7205, 0.8272, 0.7541, 0.6581, 0.5000, 0.7290],
["djuna/Q2.5-Veltha-14B-0.5", 0.7492, 0.8386, 0.7305, 0.5980, 0.4300, 0.7817],
["CultriX/Qwen2.5-14B-FinalMerge", 0.7248, 0.8277, 0.7113, 0.7052, 0.5700, 0.7001],
["CultriX/Qwen2.5-14B-MultiCultyv2", 0.7295, 0.8359, 0.7363, 0.5767, 0.4400, 0.7316],
["CultriX/Qwen2.5-14B-Brocav7", 0.7445, 0.8353, 0.7508, 0.6292, 0.4600, 0.7629],
["CultriX/Qwen2.5-14B-Broca", 0.7456, 0.8352, 0.7480, 0.6034, 0.4400, 0.7716],
["CultriX/Qwen2.5-14B-Brocav3", 0.7395, 0.8388, 0.7393, 0.6405, 0.4700, 0.7659],
["CultriX/Qwen2.5-14B-Brocav4", 0.7432, 0.8377, 0.7444, 0.6277, 0.4800, 0.7580],
["CultriX/Qwen2.5-14B-Brocav2", 0.7492, 0.8302, 0.7508, 0.6377, 0.5100, 0.7478],
["CultriX/Qwen2.5-14B-Brocav5", 0.7445, 0.8313, 0.7547, 0.6376, 0.5000, 0.7304],
["CultriX/Qwen2.5-14B-Brocav6", 0.7179, 0.8354, 0.7531, 0.6378, 0.4900, 0.7524],
["CultriX/Qwenfinity-2.5-14B", 0.7347, 0.8254, 0.7279, 0.7267, 0.5600, 0.6970],
["CultriX/Qwen2.5-14B-Emergedv2", 0.7137, 0.8335, 0.7363, 0.5836, 0.4400, 0.7344],
["CultriX/Qwen2.5-14B-Unity", 0.7063, 0.8343, 0.7423, 0.6820, 0.5700, 0.7498],
["CultriX/Qwen2.5-14B-MultiCultyv3", 0.7132, 0.8216, 0.7395, 0.6792, 0.5500, 0.7120],
["CultriX/Qwen2.5-14B-Emergedv3", 0.7436, 0.8312, 0.7519, 0.6585, 0.5500, 0.7068],
["CultriX/SeQwence-14Bv1", 0.7278, 0.8410, 0.7541, 0.6816, 0.5200, 0.7539],
["CultriX/Qwen2.5-14B-Wernickev2", 0.7391, 0.8168, 0.7273, 0.6220, 0.4500, 0.7572],
["CultriX/Qwen2.5-14B-Wernickev3", 0.7357, 0.8148, 0.7245, 0.7023, 0.5500, 0.7869],
["CultriX/Qwen2.5-14B-Wernickev4", 0.7355, 0.8290, 0.7497, 0.6306, 0.4800, 0.7635],
["CultriX/SeQwential-14B-v1", 0.7355, 0.8205, 0.7549, 0.6367, 0.4800, 0.7626],
["CultriX/Qwen2.5-14B-Wernickev5", 0.7224, 0.8272, 0.7541, 0.6790, 0.5100, 0.7578],
["CultriX/Qwen2.5-14B-Wernickev6", 0.6994, 0.7549, 0.5816, 0.6991, 0.5800, 0.7267],
["CultriX/Qwen2.5-14B-Wernickev7", 0.7147, 0.7599, 0.6097, 0.7056, 0.5700, 0.7164],
["CultriX/Qwen2.5-14B-FinalMerge-tmp2", 0.7255, 0.8192, 0.7535, 0.6671, 0.5000, 0.7612],
]
# Column schema: the model identifier first, then one score column per task.
columns = [
    "Model Configuration",
    "tinyArc",
    "tinyHellaswag",
    "tinyMMLU",
    "tinyTruthfulQA",
    "tinyTruthfulQA_mc1",
    "tinyWinogrande",
]

# Load the raw rows into a DataFrame labelled with the schema above.
df_full = pd.DataFrame(data=data_full, columns=columns)
def plot_average_scores():
    """Draw a horizontal bar chart of every model's mean score across tasks.

    The mean is taken over the task columns named in ``columns[1:]`` and is
    also stored on the shared ``df_full`` frame as an "Average Score" column
    (the original behaviour, kept for backward compatibility). Models are
    listed best-first from top to bottom.

    Returns:
        str: Path of the PNG written to the working directory,
        "average_performance.png".
    """
    # Select the task columns by name: a positional slice would fold a
    # previously stored "Average Score" back into the mean on repeated calls.
    df_full["Average Score"] = df_full[columns[1:]].mean(axis=1)
    df_avg_sorted = df_full.sort_values(by="Average Score", ascending=False)

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        ax.barh(df_avg_sorted["Model Configuration"], df_avg_sorted["Average Score"])
        ax.set_title("Average Performance of Models Across Tasks", fontsize=16)
        ax.set_xlabel("Average Score", fontsize=14)
        ax.set_ylabel("Model Configuration", fontsize=14)
        # barh draws the first row at the bottom; flip so the best model is on top.
        ax.invert_yaxis()
        ax.grid(axis="x", linestyle="--", alpha=0.7)
        fig.tight_layout()
        fig.savefig("average_performance.png")
    finally:
        # Release the figure; otherwise each button click leaves one open
        # figure behind for the lifetime of the server process.
        plt.close(fig)
    return "average_performance.png"
def plot_task_performance():
    """Draw one line per model showing its score on each task.

    Only the task columns named in ``columns[1:]`` are plotted, so a derived
    "Average Score" column on ``df_full`` never appears as an extra task.

    Returns:
        str: Path of the PNG written to the working directory,
        "task_performance.png".
    """
    task_columns = columns[1:]
    # One row of scores per model, in the same order as the model names.
    score_rows = df_full[task_columns].to_numpy()

    fig, ax = plt.subplots(figsize=(14, 10))
    try:
        # Walk the rows directly instead of re-filtering a melted frame once
        # per model.
        for model, model_scores in zip(df_full["Model Configuration"], score_rows):
            ax.plot(task_columns, model_scores, marker="o", label=model)
        ax.set_title("Performance of All Models Across Tasks", fontsize=16)
        ax.set_xlabel("Task", fontsize=14)
        ax.set_ylabel("Score", fontsize=14)
        ax.tick_params(axis="x", labelrotation=45)
        # Anchor the legend outside the axes so it does not cover the lines.
        ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=9)
        ax.grid(axis="y", linestyle="--", alpha=0.7)
        fig.tight_layout()
        fig.savefig("task_performance.png")
    finally:
        # Release the figure so repeated clicks do not accumulate open figures.
        plt.close(fig)
    return "task_performance.png"
def plot_task_specific_top_models():
    """Draw a bar chart of the best score per task, labelled with the winner.

    Every task column named in ``columns[1:]`` is considered. The previous
    positional slice (``iloc[:, :-1]``) silently dropped the last task unless
    an "Average Score" column had already been appended to ``df_full``.

    Returns:
        str: Path of the PNG written to the working directory,
        "task_specific_top_models.png".
    """
    task_scores = df_full.set_index("Model Configuration")[columns[1:]]
    results = (
        pd.DataFrame({"Top Model": task_scores.idxmax(), "Score": task_scores.max()})
        .reset_index()
        .rename(columns={"index": "Task"})
    )

    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        bars = ax.bar(results["Task"], results["Score"])
        # Write the winning model inside its bar; without this the chart
        # shows only scores and never says which model achieved them.
        for bar, model in zip(bars, results["Top Model"]):
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() / 2,
                model,
                ha="center",
                va="center",
                rotation=90,
                fontsize=9,
                color="white",
            )
        ax.set_title("Task-Specific Top Models", fontsize=16)
        ax.set_xlabel("Task", fontsize=14)
        ax.set_ylabel("Score", fontsize=14)
        ax.grid(axis="y", linestyle="--", alpha=0.7)
        fig.tight_layout()
        fig.savefig("task_specific_top_models.png")
    finally:
        # Release the figure so repeated clicks do not accumulate open figures.
        plt.close(fig)
    return "task_specific_top_models.png"
def top_3_models_per_task():
    """Build a table of the three highest-scoring models for each task.

    Tasks are the columns named in ``columns[1:]``. The previous slice
    (``df_full.columns[1:-1]``) skipped the last task unless an
    "Average Score" column had already been appended to ``df_full``.

    Returns:
        pandas.DataFrame: One row per task with the columns "Task",
        "Top 3 Models" (list of model names, best first) and "Scores"
        (list of the matching scores).
    """
    rows = []
    for task in columns[1:]:
        best = df_full.nlargest(3, task)
        rows.append(
            {
                "Task": task,
                "Top 3 Models": best["Model Configuration"].tolist(),
                "Scores": best[task].tolist(),
            }
        )
    # Fix the column order explicitly so the table shape does not depend on
    # dict ordering or on whether any rows were produced.
    return pd.DataFrame(rows, columns=["Task", "Top 3 Models", "Scores"])
def summary_statistics():
    """Return descriptive statistics for each task's scores.

    Only the task columns named in ``columns[1:]`` are summarised, so a
    derived "Average Score" column on ``df_full`` is never reported as if it
    were a task.

    Returns:
        pandas.DataFrame: One row per task. The first column ("index") holds
        the task name, followed by the ``describe()`` statistics and a
        "Std Dev" column.
    """
    task_scores = df_full[columns[1:]]
    stats = task_scores.describe().T  # one row per task
    # NOTE: "Std Dev" carries the same values as describe()'s "std" column;
    # it is kept so the returned table retains its existing columns.
    stats["Std Dev"] = task_scores.std(axis=0)
    return stats.reset_index()
def plot_distribution_boxplots():
    """Draw one box plot per task showing how scores spread across models.

    Only the task columns named in ``columns[1:]`` are melted into the long
    format, so a derived "Average Score" column on ``df_full`` does not show
    up as an extra box.

    Returns:
        str: Path of the PNG written to the working directory,
        "distribution_boxplots.png".
    """
    df_melted = df_full.melt(
        id_vars="Model Configuration",
        value_vars=columns[1:],
        var_name="Task",
        value_name="Score",
    )

    fig, ax = plt.subplots(figsize=(14, 8))
    try:
        sns.boxplot(x="Task", y="Score", data=df_melted, ax=ax)
        ax.set_title("Score Distribution by Task", fontsize=16)
        ax.set_xlabel("Task", fontsize=14)
        ax.set_ylabel("Score", fontsize=14)
        ax.grid(axis="y", linestyle="--", alpha=0.7)
        fig.tight_layout()
        fig.savefig("distribution_boxplots.png")
    finally:
        # Release the figure so repeated clicks do not accumulate open figures.
        plt.close(fig)
    return "distribution_boxplots.png"
def best_overall_model():
    """Return the row of the model with the highest mean score across tasks.

    The mean is taken over the task columns named in ``columns[1:]`` and is
    also stored on the shared ``df_full`` frame as an "Average Score" column
    (the original behaviour, kept for backward compatibility).

    Returns:
        pandas.Series: The winning model's row, including its
        "Average Score". On a tie the first such row is returned.
    """
    # Select the task columns by name: a positional slice would fold a
    # previously stored "Average Score" back into the mean on repeated calls.
    df_full["Average Score"] = df_full[columns[1:]].mean(axis=1)
    return df_full.loc[df_full["Average Score"].idxmax()]
def plot_heatmap():
    """Draw an annotated heatmap of every model's score on every task.

    The plotted data and the x-axis labels both come from ``columns[1:]``.
    The previous positional slice could include a derived "Average Score"
    column while the labels still listed only the task names, leaving the
    two out of step.

    Returns:
        str: Path of the PNG written to the working directory,
        "performance_heatmap.png".
    """
    task_columns = columns[1:]

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        sns.heatmap(
            df_full[task_columns],
            annot=True,
            cmap="YlGnBu",
            xticklabels=task_columns,
            yticklabels=df_full["Model Configuration"],
            ax=ax,
        )
        ax.set_title("Performance Heatmap", fontsize=16)
        fig.tight_layout()
        fig.savefig("performance_heatmap.png")
    finally:
        # Release the figure so repeated clicks do not accumulate open figures.
        plt.close(fig)
    return "performance_heatmap.png"
# UI layout, top to bottom: one row per analysis, each holding a button and
# the component that displays the handler's result.
# Entry format: (button label, click handler, output component class, kwargs)
_PANELS = (
    ("Show Average Performance", plot_average_scores, gr.Image, {"type": "filepath"}),
    ("Show Task Performance", plot_task_performance, gr.Image, {"type": "filepath"}),
    ("Task-Specific Top Models", plot_task_specific_top_models, gr.Image, {"type": "filepath"}),
    ("Top 3 Models Per Task", top_3_models_per_task, gr.Dataframe, {}),
    ("Show Summary Statistics", summary_statistics, gr.Dataframe, {}),
    ("Plot Score Distributions", plot_distribution_boxplots, gr.Image, {"type": "filepath"}),
    ("Best Overall Model", best_overall_model, gr.Textbox, {}),
    ("Plot Performance Heatmap", plot_heatmap, gr.Image, {"type": "filepath"}),
)

with gr.Blocks() as demo:
    gr.Markdown("# Model Performance Analysis")
    for _label, _handler, _component_cls, _component_kwargs in _PANELS:
        with gr.Row():
            _button = gr.Button(_label)
            # Components must be instantiated inside the row to be placed in it.
            _output = _component_cls(**_component_kwargs)
            _button.click(_handler, outputs=_output)

demo.launch()
|