Spaces:

llm-council
/

alpaca-eval-explorer

Running

App Files Files Community

alpaca-eval-explorer / app.py

justinxzhao

Initial version of AlpacaEval Visualizations

ca1e7f4 9 months ago

raw

history blame

7.81 kB

	import streamlit as st
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	import statsmodels.api as sm

	# Use Streamlit's "wide" page layout. This runs at module import time and is
	# the first Streamlit command issued by this file.
	# NOTE(review): Streamlit expects set_page_config to precede other st.* calls;
	# keep it ahead of any new Streamlit code — confirm against the Streamlit docs.
	st.set_page_config(layout="wide")


	def prep_rankings_table(df, y_column):
	    """Build a leaderboard table ranked by ``y_column``.

	    Args:
	        df: DataFrame containing at least the columns ``model_name``,
	            ``y_column`` and ``num_words_mean``. It is not modified.
	        y_column: Name of the score column to rank by.

	    Returns:
	        A new DataFrame with the columns ``rank``, ``model_name``,
	        ``y_column`` and ``num_words_mean`` (in that order), sorted by
	        ``y_column`` in descending order. ``rank`` starts at 1 and
	        ``y_column`` is rounded to two decimals.
	    """
	    columns = ["model_name", y_column, "num_words_mean"]

	    # Selecting with a list already yields a new frame, so no explicit copy
	    # is needed. drop=True discards the old index instead of re-inserting it
	    # as a column: the inserted column was never used, and inserting it
	    # raises ValueError when the index is named like a selected column
	    # (for example an index called "model_name").
	    ranked = (
	        df[columns]
	        .sort_values(y_column, ascending=False)
	        .reset_index(drop=True)
	    )

	    # The fresh 0-based index is the sort position; ranks are 1-based.
	    ranked["rank"] = ranked.index + 1
	    ranked[y_column] = ranked[y_column].round(2)

	    return ranked[["rank", "model_name", y_column, "num_words_mean"]]


	def _build_presets(df):
	    """Return the preset model groups offered in the UI.

	    Args:
	        df: DataFrame with a string ``model_name`` column.

	    Returns:
	        Dict mapping each preset name to the list of ``model_name`` values
	        that match the preset's pattern (case-insensitive regex search).
	        ``"custom"`` maps to an empty list.
	    """
	    # The patterns are regular expressions. The alternation in the "gpt"
	    # pattern must be a bare `|`: an escaped `\|` would only match a literal
	    # pipe character and the group would always be empty.
	    patterns = {
	        "gpt": r"openai|gpt",
	        "claude": "claude",
	        "moa": "moa",
	        "llama": "llama",
	    }
	    names = df["model_name"]
	    presets = {
	        group: names[names.str.contains(pattern, case=False)].tolist()
	        for group, pattern in patterns.items()
	    }
	    presets["custom"] = []
	    return presets


	def _marker_trace(data, x_column, y_column, name, size, color, hidden=False):
	    """Build a marker-only scatter trace whose point text is the model name.

	    When ``hidden`` is true the trace is created with
	    ``visible="legendonly"``; otherwise ``visible`` is left unset.
	    """
	    extra = {"visible": "legendonly"} if hidden else {}
	    return go.Scatter(
	        x=data[x_column],
	        y=data[y_column],
	        mode="markers",
	        name=name,
	        text=data["model_name"],
	        marker=dict(size=size, color=color),
	        showlegend=True,
	        **extra,
	    )


	def _add_trendline(fig, df, x, y, name, color, visibility="legendonly"):
	    """Add an OLS trendline of ``df[y]`` on ``df[x]`` to ``fig``.

	    Args:
	        fig: Plotly figure that receives the line trace.
	        df: DataFrame providing both columns.
	        x: Name of the explanatory column.
	        y: Name of the response column.
	        name: Legend prefix; the trace is named ``"<name> trendline"``.
	        color: Line colour.
	        visibility: Value passed as the trace's ``visible`` property.

	    Returns:
	        The R² of the fitted model.
	    """
	    # add_constant adds the intercept column to the design matrix.
	    design = sm.add_constant(df[x])
	    fitted = sm.OLS(df[y], design).fit()
	    fig.add_trace(
	        go.Scatter(
	            x=df[x],
	            y=fitted.predict(design),
	            mode="lines",
	            name=f"{name} trendline",
	            line=dict(color=color, width=2),
	            visible=visibility,
	        )
	    )
	    return fitted.rsquared


	def _create_scatter_plot(df, y_column, selected_models, title):
	    """Build the score-vs-mean-length figure for one score column.

	    Traces are added in this order: all models by word count (legend-only),
	    all models by token count, the highlighted models for both measures
	    (only when ``selected_models`` is non-empty), then the word trendline
	    (legend-only) and the token trendline.

	    Returns:
	        Tuple ``(fig, r_squared_words, r_squared_tokens)``.
	    """
	    fig = go.Figure()

	    fig.add_trace(
	        _marker_trace(
	            df, "num_words_mean", y_column, "words", 5, "skyblue", hidden=True
	        )
	    )
	    fig.add_trace(
	        _marker_trace(df, "num_tokens_mean", y_column, "tokens", 5, "orange")
	    )

	    # Highlight the selected models with larger markers.
	    if selected_models:
	        selected_data = df[df["model_name"].isin(selected_models)]
	        fig.add_trace(
	            _marker_trace(
	                selected_data,
	                "num_words_mean",
	                y_column,
	                "selected words",
	                10,
	                "blue",
	                hidden=True,
	            )
	        )
	        fig.add_trace(
	            _marker_trace(
	                selected_data,
	                "num_tokens_mean",
	                y_column,
	                "selected tokens",
	                10,
	                "orangered",
	            )
	        )

	    r_squared_words = _add_trendline(
	        fig, df, "num_words_mean", y_column, "words", "blue"
	    )
	    r_squared_tokens = _add_trendline(
	        fig, df, "num_tokens_mean", y_column, "tokens", "orangered", visibility=True
	    )

	    # Any column other than the two listed here is labelled as the discrete
	    # win rate.
	    y_axis_titles = {
	        "win_rate": "Win rate",
	        "length_controlled_winrate": "LC Win Rate",
	    }
	    fig.update_layout(
	        xaxis_title="Mean length",
	        yaxis_title=y_axis_titles.get(y_column, "Discrete Win Rate"),
	        title=title,
	        legend_title="Legend",
	    )

	    return fig, r_squared_words, r_squared_tokens


	def _render_tab(tab, fig, df, y_column, r_squared_words, r_squared_tokens):
	    """Fill one tab with its chart, rankings table and trendline R² values."""
	    with tab:
	        chart_col, table_col = st.columns([3, 2])
	        chart_col.plotly_chart(fig)
	        table_col.markdown("#### Rankings")
	        table_col.dataframe(
	            prep_rankings_table(df, y_column),
	            hide_index=True,
	        )
	        with st.expander("Trendline R²"):
	            st.markdown(
	                f"- R² (Words vs {y_column}): {r_squared_words:.2f}\n"
	                f"- R² (Tokens vs {y_column}): {r_squared_tokens:.2f}"
	            )


	def app():
	    """Render the AlpacaEval visualisation page.

	    Loads ``data/model_win_rates.json``, lets the user highlight a preset or
	    hand-picked group of models, and shows one tab per score column
	    (length-controlled win rate, win rate, discrete win rate). Each tab
	    holds a scatter plot against mean length, a rankings table and the
	    trendline R² values. The raw data is shown in an expander at the end.
	    """
	    st.title("AlpacaEval Visualizations")

	    st.markdown("## Win rate vs. overall mean length")

	    # Load the data
	    df = pd.read_json("data/model_win_rates.json")

	    # Add a model name column for hover labels (taken from the index).
	    df["model_name"] = df.index.astype(str)

	    presets = _build_presets(df)

	    # Radio button for preset groups
	    preset_selection = st.radio(
	        "Select a preset group of models or choose 'custom' to select manually",
	        options=["custom", "gpt", "claude", "moa", "llama"],
	    )

	    # Manual selection is only offered for the 'custom' choice.
	    if preset_selection == "custom":
	        selected_models = st.multiselect(
	            "Select models to highlight", options=df["model_name"].unique()
	        )
	    else:
	        selected_models = presets[preset_selection]

	    # (tab label, score column, chart title)
	    charts = [
	        ("LC Win Rate", "length_controlled_winrate", "Length-Controlled Win Rate"),
	        ("Win Rate", "win_rate", "Win Rate"),
	        ("Discrete Win Rate", "discrete_win_rate", "Discrete Win Rate"),
	    ]

	    # Build every figure before any tab is created.
	    plots = [
	        _create_scatter_plot(df, y_column, selected_models, title)
	        for _, y_column, title in charts
	    ]

	    # Create tabs for each chart
	    tabs = st.tabs([label for label, _, _ in charts])
	    for tab, (_, y_column, _), (fig, r_squared_words, r_squared_tokens) in zip(
	        tabs, charts, plots
	    ):
	        _render_tab(tab, fig, df, y_column, r_squared_words, r_squared_tokens)

	    # NOTE(review): rendered below the tabs rather than inside the last one;
	    # confirm against the intended layout.
	    with st.expander("Raw data"):
	        st.dataframe(df)


	# Render the page only when this file is executed as the main script, not
	# when it is imported as a module.
	if __name__ == "__main__":
	    app()