Spaces:
Running
Running
File size: 8,010 Bytes
982fdda 17aa8f3 982fdda 17aa8f3 38e1340 982fdda 7b7eb30 982fdda 17aa8f3 982fdda 38e1340 982fdda 38e1340 982fdda b8253d2 17aa8f3 982fdda 144fe6c b8253d2 6f821b5 b8253d2 7b7eb30 03bdf6c 7b7eb30 b8253d2 03bdf6c 7b7eb30 b8253d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.io as pio
from io import StringIO
import base64
# Parse pasted benchmark text into a tidy (Model, Task, Value) table
def parse_data(file_content):
    """Extract per-model task scores from benchmark text.

    A line starting with ``hf (pretrained=`` selects the current model: the
    text after ``pretrained=`` up to the first comma. Every later non-empty
    line containing ``|`` is split on ``|``; the first cell is taken as the
    task name and the second cell is converted to ``float``. Rows whose second
    cell is not numeric are reported on stdout and skipped. Lines seen before
    any model header are ignored.

    Args:
        file_content: The raw text to parse.

    Returns:
        A DataFrame with columns ``Model``, ``Task`` and ``Value`` (empty when
        nothing could be parsed).
    """
    header_prefix = 'hf (pretrained='
    rows = []
    active_model = None

    for raw_line in file_content.splitlines():
        text = raw_line.strip()

        if text.startswith(header_prefix):
            after_key = text.split('pretrained=')[1]
            active_model = after_key.split(',')[0]
            continue

        # Only table-like rows that follow a model header are of interest.
        if not (text and active_model) or '|' not in text:
            continue

        cells = [cell.strip() for cell in text.split('|')]
        if len(cells) < 2:
            continue

        try:
            score = float(cells[1])
        except ValueError:
            print(f"Skipping row due to invalid value: {cells}")
        else:
            rows.append([active_model, cells[0], score])

    if not rows:
        print("No valid data found in the file.")
    return pd.DataFrame(rows, columns=['Model', 'Task', 'Value'])
# Mean score per model across all parsed tasks
def calculate_averages(data):
    """Return the mean ``Value`` for each ``Model``.

    Args:
        data: DataFrame with at least ``Model`` and ``Value`` columns.

    Returns:
        A DataFrame with columns ``Model`` and ``Average Performance``; it has
        no rows when ``data`` is empty.
    """
    if data.empty:
        print("No data available to calculate averages.")
        return pd.DataFrame(columns=['Model', 'Average Performance'])

    per_model_mean = data.groupby('Model')['Value'].mean()
    as_table = per_model_mean.reset_index()
    return as_table.rename(columns={'Value': 'Average Performance'})
def create_bar_chart(df, category):
    """Build a horizontal Plotly bar chart ranking models by ``category``.

    Args:
        df: DataFrame containing a ``Model`` column and the ``category`` column.
        category: Name of the numeric column to plot and sort by.

    Returns:
        A ``plotly.graph_objs.Figure`` with one bar per row, sorted ascending
        by ``category`` and coloured by the same values.
    """
    ranked = df[['Model', category]].sort_values(by=category, ascending=True)
    scores = ranked[category]

    bars = go.Bar(
        x=scores,
        y=ranked['Model'],
        orientation='h',
        marker={'color': scores, 'colorscale': 'Viridis'},
        hoverinfo='x+y',
        text=scores,
        textposition='auto',
    )

    figure = go.Figure(bars)
    figure.update_layout(
        margin={'l': 20, 'r': 20, 't': 20, 'b': 20},
        title=f"Leaderboard for {category} Scores",
    )
    return figure
def _figure_to_base64():
    """Save the active Matplotlib figure as PNG and return it base64-encoded.

    The figure is closed even when saving fails, so open figures do not
    accumulate across calls.
    """
    # Local import: the module only imports StringIO, which cannot hold the
    # binary PNG data that savefig writes.
    from io import BytesIO

    png_buffer = BytesIO()
    try:
        plt.savefig(png_buffer, format='png')
    finally:
        plt.close()
    return base64.b64encode(png_buffer.getvalue()).decode('utf-8')


def generate_visualizations(data, averages):
    """Render all charts for the parsed benchmark results.

    Args:
        data: DataFrame with ``Model``, ``Task`` and ``Value`` columns. It is
            not modified; a copy is used internally.
        averages: DataFrame with ``Model`` and ``Average Performance`` columns.

    Returns:
        A 6-tuple ``(image_avg, image_line, image_heatmap, image_boxplot,
        plotly_avg, plotly_tasks)``. The four images are base64-encoded PNG
        strings, ``plotly_avg`` is an HTML fragment, and ``plotly_tasks`` maps
        each task name to an HTML fragment. Every element is ``None`` when
        ``averages`` is empty; all but ``image_avg`` are ``None`` when ``data``
        is empty.
    """
    if averages.empty:
        print("No averages to visualize.")
        return None, None, None, None, None, None

    sns.set(style='whitegrid')
    averages = averages.sort_values(by='Average Performance')

    # Matplotlib average performance plot
    plt.figure(figsize=(12, 8))
    sns.barplot(data=averages, x='Average Performance', y='Model', palette='viridis')
    plt.title('Average Performance of Models', fontsize=16)
    plt.xlabel('Average Performance', fontsize=12)
    plt.ylabel('Model', fontsize=12)
    plt.tight_layout()
    image_avg = _figure_to_base64()

    # Order models by their average so every per-task chart uses the same
    # ordering. Work on a copy so the caller's DataFrame keeps its dtypes.
    sorted_models = averages['Model'].tolist()
    data = data.copy()
    data['Model'] = pd.Categorical(data['Model'], categories=sorted_models, ordered=True)
    data = data.sort_values(by=['Model', 'Task'])
    if data.empty:
        print("No data available for line plot.")
        return image_avg, None, None, None, None, None

    # Line plot for task performance by model
    plt.figure(figsize=(14, 10))
    sns.lineplot(data=data, x='Task', y='Value', hue='Model', marker='o')
    plt.title('Task Performance by Model', fontsize=16)
    plt.xlabel('Task', fontsize=12)
    plt.ylabel('Performance', fontsize=12)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Model')
    plt.xticks(rotation=45)
    plt.tight_layout()
    image_line = _figure_to_base64()

    # Heatmap of task performance
    pivot_table = data.pivot_table(index='Task', columns='Model', values='Value')
    plt.figure(figsize=(12, 10))
    sns.heatmap(pivot_table, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
    plt.title('Task Performance Heatmap', fontsize=16)
    plt.xlabel('Model', fontsize=12)
    plt.ylabel('Task', fontsize=12)
    plt.tight_layout()
    image_heatmap = _figure_to_base64()

    # Boxplot of performance distribution per model
    plt.figure(figsize=(12, 8))
    sns.boxplot(data=data, x='Model', y='Value', palette='Set2')
    plt.title('Performance Distribution per Model', fontsize=16)
    plt.xlabel('Model', fontsize=12)
    plt.ylabel('Performance', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    image_boxplot = _figure_to_base64()

    # Plotly leaderboard of averages
    avg_figure = create_bar_chart(averages, 'Average Performance')
    plotly_avg = pio.to_html(avg_figure, full_html=False)

    # One Plotly leaderboard per task, keyed by task name
    plotly_tasks = {}
    for task in data['Task'].unique():
        task_figure = create_bar_chart(data[data['Task'] == task], 'Value')
        task_figure.update_layout(title=f"Leaderboard for {task} Scores")
        plotly_tasks[task] = pio.to_html(task_figure, full_html=False)

    return image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks
def process_and_visualize(file_content):
    """Run the full pipeline: parse the text, average per model, draw charts.

    Args:
        file_content: Raw benchmark text to parse.

    Returns:
        A 7-tuple: a text summary of the per-model averages (sorted ascending),
        followed by the six values produced by ``generate_visualizations`` in
        the same order.
    """
    parsed = parse_data(file_content)
    model_means = calculate_averages(parsed)

    (
        image_avg,
        image_line,
        image_heatmap,
        image_boxplot,
        plotly_avg,
        plotly_tasks,
    ) = generate_visualizations(parsed, model_means)

    ranking = model_means.sort_values(by='Average Performance').to_string()
    summary = f"Average Performance per Model:\n{ranking}"
    return (
        summary,
        image_avg,
        image_line,
        image_heatmap,
        image_boxplot,
        plotly_avg,
        plotly_tasks,
    )
if __name__ == "__main__":
    import tempfile

    # Known benchmark task names; not referenced by the UI wiring below.
    task_names = ['tinyArc', 'tinyHellaswag', 'tinyMMLU', 'tinyTruthfulQA', 'tinyTruthfulQA_mc1', 'tinyWinogrande']

    def _png_base64_to_path(encoded_png):
        """Write a base64-encoded PNG to a temp file and return its path.

        ``gr.Image`` interprets a plain string as a file path or URL, so the
        base64 text produced by the pipeline has to be materialised on disk.
        ``None`` (no chart available) is passed through unchanged.
        The temp files are not removed here because Gradio reads them after
        the handler has returned.
        """
        if not encoded_png:
            return None
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp.write(base64.b64decode(encoded_png))
        return tmp.name

    def render_dashboard(file_content):
        """Run the pipeline once and return one value per output component."""
        (
            summary,
            avg_png,
            line_png,
            heatmap_png,
            boxplot_png,
            avg_html,
            task_html,
        ) = process_and_visualize(file_content or "")

        # task_html is None when nothing could be parsed or plotted.
        # NOTE(review): the Plotly fragments embed <script> tags; confirm that
        # gr.HTML executes them in the Gradio version in use.
        per_task_html = "\n".join((task_html or {}).values())

        return (
            summary,
            _png_base64_to_path(avg_png),
            _png_base64_to_path(line_png),
            _png_base64_to_path(heatmap_png),
            _png_base64_to_path(boxplot_png),
            avg_html or "",
            per_task_html,
        )

    with gr.Blocks(title="LLM Benchmark Visualizer") as demo:
        gr.Markdown("Upload your LLM benchmark data and visualize the results.")
        with gr.Row():
            input_text = gr.Textbox(lines=10, label="Paste your data here")
        with gr.Row():
            output_text = gr.Textbox(label="Average Performance per Model")
        with gr.Row():
            with gr.Column():
                image_avg = gr.Image(label="Matplotlib Average Performance Chart")
                image_line = gr.Image(label="Matplotlib Task Performance Line Chart")
            with gr.Column():
                image_heatmap = gr.Image(label="Matplotlib Task Performance Heatmap")
                image_boxplot = gr.Image(label="Matplotlib Performance Distribution Boxplot")
        with gr.Row():
            plotly_avg = gr.HTML(label="Plotly Average Performance Chart")
        with gr.Row():
            # A single HTML component holds every per-task chart. An event
            # output must be a component, which gr.TabbedInterface is not.
            plotly_task_charts = gr.HTML(label="Plotly Per-Task Leaderboards")

        # One handler feeds all outputs, so the pipeline runs once per change
        # and the number of returned values matches the number of outputs.
        input_text.change(
            fn=render_dashboard,
            inputs=input_text,
            outputs=[
                output_text,
                image_avg,
                image_line,
                image_heatmap,
                image_boxplot,
                plotly_avg,
                plotly_task_charts,
            ],
        )

    demo.launch(share=True)
|