import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import random
# Set the layout to wide
st.set_page_config(layout="wide")
def prep_rankings_table(df, y_column):
# Create a copy of the dataframe.
df_copy = df.copy()
# Select the columns we care about, sort by the y column, and reset the index.
df_copy = (
df_copy[
[
"model_name",
y_column,
"num_words_mean",
]
]
.sort_values(y_column, ascending=False)
.reset_index()
)
# Create a rank column.
df_copy["rank"] = df_copy.index + 1
# Round the y column.
df_copy[y_column] = df_copy[y_column].round(2)
# Fix the order.
df_copy = df_copy[["rank", "model_name", y_column, "num_words_mean"]]
return df_copy
def get_preference(preference_score):
    """Convert a single-element preference Series into a preference label."""
    rounded_preference_score = int(preference_score.round(0).iloc[0])
    return get_preference_from_rounded_score(rounded_preference_score)


def get_preference_from_rounded_score(score):
    """Map a rounded preference score to a label: 2 means output 2 wins, 1 means output 1 wins, anything else is a tie."""
    if score == 2:
        return "[2>1]"
    elif score == 1:
        return "[1>2]"
    return "[1=2]"
def app():
fixed_model = "gpt4_1106_preview"
    # Initialize session state variables if they do not exist.
if "selected_instruction" not in st.session_state:
st.session_state.selected_instruction = None
if "selected_model" not in st.session_state:
st.session_state.selected_model = "gpt4"
if "selected_judge" not in st.session_state:
st.session_state.selected_judge = None
if "selected_dataset" not in st.session_state:
st.session_state.selected_dataset = "NEW"
if "instruction_options" not in st.session_state:
st.session_state.instruction_options = []
    # Update the instruction options to match the currently selected dataset.
def update_instruction_options():
selected_dataset = st.session_state.dataset_selector
if selected_dataset == "all" or selected_dataset == "NEW":
instruction_options = df_response_judging["instruction"].unique().tolist()
elif (
selected_dataset == "None"
or selected_dataset is None
or str(selected_dataset) == ""
):
instruction_options = (
df_response_judging[pd.isna(df_response_judging["dataset"])][
"instruction"
]
.unique()
.tolist()
)
else:
instruction_options = (
df_response_judging[df_response_judging["dataset"] == selected_dataset][
"instruction"
]
.unique()
.tolist()
)
st.session_state.instruction_options = instruction_options
def update_instruction():
st.session_state.selected_instruction = st.session_state.instruction_selector
def update_model():
st.session_state.selected_model = st.session_state.model_selector
def update_judge():
st.session_state.selected_judge = st.session_state.judge_selector
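    # Pick a random dataset, model, and instruction for the data explorer tab.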
def randomize_selection():
st.session_state.dataset_selector = random.choice(
["all"] + df_response_judging["dataset"].dropna().unique().tolist()
)
st.session_state.selected_model = random.choice(model_options)
update_instruction_options()
st.session_state.selected_instruction = random.choice(
st.session_state.instruction_options
)
st.title("AlpacaEval Visualizations")
outer_tabs = st.tabs(["Length bias in overall win rate", "Data explorer"])
# Load the data
df = pd.read_json("data/model_win_rates.jsonl", lines=True, orient="records")
# df_responses = pd.read_json("data/df_responses.jsonl", lines=True, orient="records")
df_response_judging = pd.read_json(
"data/df_response_judging.jsonl", lines=True, orient="records"
)
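    # Each row of df_response_judging is one pairwise battle: generator_1 is the fixed reference model,
    # generator_2 is the comparison model, and "preference" holds the judge's score.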
# Prepare the model selector options
model_options = df_response_judging["generator_2"].unique().tolist()
with outer_tabs[0]:
# Define the preset groups
presets = {
"gpt": df[df["model_name"].str.contains("openai|gpt", case=False)][
"model_name"
].tolist(),
"claude": df[df["model_name"].str.contains("claude", case=False)][
"model_name"
].tolist(),
"moa": df[df["model_name"].str.contains("moa", case=False)][
"model_name"
].tolist(),
"llama": df[df["model_name"].str.contains("llama", case=False)][
"model_name"
].tolist(),
"custom": [],
}
# Add radio button for preset groups
preset_selection = st.radio(
"Select a preset group of models or choose 'custom' to select manually.",
options=["custom", "gpt", "claude", "moa", "llama"],
)
st.divider()
# Add multiselect for custom model selection
if preset_selection == "custom":
selected_models = st.multiselect(
"Select models to highlight", options=df["model_name"].unique()
)
else:
selected_models = presets[preset_selection]
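        # Helper that scatters win rate (y_column) against mean response length in words and tokens,
        # highlights any selected models, and overlays OLS trendlines.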
def create_scatter_plot(df, y_column, selected_models, title):
fig = go.Figure()
# Add scatter plots for num_words_mean and num_tokens_mean
fig.add_trace(
go.Scatter(
x=df["num_words_mean"],
y=df[y_column],
mode="markers",
name="words",
text=df["model_name"],
marker=dict(size=5, color="skyblue"),
showlegend=True,
)
)
fig.add_trace(
go.Scatter(
x=df["num_tokens_mean"],
y=df[y_column],
mode="markers",
name="tokens",
text=df["model_name"],
marker=dict(size=5, color="orange"),
showlegend=True,
visible="legendonly", # Make 'words' trace initially visible only in legend
)
)
# Highlight selected models
if selected_models:
selected_data = df[df["model_name"].isin(selected_models)]
fig.add_trace(
go.Scatter(
x=selected_data["num_words_mean"],
y=selected_data[y_column],
mode="markers",
name="selected words",
text=selected_data["model_name"],
marker=dict(size=10, color="blue"),
showlegend=True,
)
)
fig.add_trace(
go.Scatter(
x=selected_data["num_tokens_mean"],
y=selected_data[y_column],
mode="markers",
name="selected tokens",
text=selected_data["model_name"],
marker=dict(size=10, color="orangered"),
showlegend=True,
visible="legendonly", # Make 'selected words' trace initially visible only in legend
)
)
# Add trendlines
def add_trendline(fig, x, y, name, color, visibility="legendonly"):
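                # Fit an ordinary least squares line of df[y] on df[x], draw it on the figure, and return the fit's R².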
X = sm.add_constant(df[x])
model = sm.OLS(df[y], X).fit()
trendline = model.predict(X)
fig.add_trace(
go.Scatter(
x=df[x],
y=trendline,
mode="lines",
name=f"{name} trendline",
line=dict(color=color, width=2),
visible=visibility, # Control the initial visibility
)
)
return model.rsquared
r_squared_words = add_trendline(
fig, "num_words_mean", y_column, "words", "blue", visibility=True
)
r_squared_tokens = add_trendline(
fig, "num_tokens_mean", y_column, "tokens", "orangered"
)
# Update layout with titles and labels
fig.update_layout(
xaxis_title="Mean length",
yaxis_title=(
"Win rate"
if y_column == "win_rate"
else (
"LC Win Rate"
if y_column == "length_controlled_winrate"
else "Discrete Win Rate"
)
),
title=title,
legend_title="Legend",
)
return fig, r_squared_words, r_squared_tokens
st.markdown("## Overall win rate")
y_column1 = "length_controlled_winrate"
y_column2 = "win_rate"
y_column3 = "discrete_win_rate"
fig1, r_squared_words_1, r_squared_tokens_1 = create_scatter_plot(
df, y_column1, selected_models, "Length-Controlled Win Rate"
)
fig2, r_squared_words_2, r_squared_tokens_2 = create_scatter_plot(
df, y_column2, selected_models, "Win Rate"
)
fig3, r_squared_words_3, r_squared_tokens_3 = create_scatter_plot(
df, y_column3, selected_models, "Discrete Win Rate"
)
# Create tabs for each chart
tab1, tab2, tab3 = st.tabs(["LC Win Rate", "Win Rate", "Discrete Win Rate"])
with tab1:
col1, col2 = st.columns([3, 2])
col1.plotly_chart(fig1)
col2.markdown("#### Rankings")
prepped_df = prep_rankings_table(df, "length_controlled_winrate")
col2.dataframe(
prepped_df,
hide_index=True,
)
with st.expander("Trendline R²"):
st.markdown(
f"- R² (Words vs {y_column1}): {r_squared_words_1:.2f} \n- R² (Tokens vs {y_column1}): {r_squared_tokens_1:.2f}"
)
with tab2:
col1, col2 = st.columns([3, 2])
col1.plotly_chart(fig2)
col2.markdown("#### Rankings")
prepped_df = prep_rankings_table(df, "win_rate")
col2.dataframe(
prepped_df,
hide_index=True,
)
with st.expander("Trendline R²"):
st.markdown(
f"- R² (Words vs {y_column2}): {r_squared_words_2:.2f} \n- R² (Tokens vs {y_column2}): {r_squared_tokens_2:.2f}"
)
with tab3:
col1, col2 = st.columns([3, 2])
col1.plotly_chart(fig3)
col2.markdown("#### Rankings")
prepped_df = prep_rankings_table(df, "discrete_win_rate")
col2.dataframe(
prepped_df,
hide_index=True,
)
with st.expander("Trendline R²"):
st.markdown(
f"- R² (Words vs {y_column3}): {r_squared_words_3:.2f}\n- R² (Tokens vs {y_column3}): {r_squared_tokens_3:.2f}"
)
st.markdown("## Length bias in battles")
df_response_judging_copy = df_response_judging.copy()
        # If specific models are highlighted above, restrict the battles to those models.
        if selected_models:
            df_response_judging_copy = df_response_judging_copy[
                df_response_judging_copy["generator_2"].isin(selected_models)
            ]
        # Compute per-response word counts, their difference, and the discretized preference label.
        df_response_judging_copy["output_1_num_words"] = df_response_judging_copy[
            "output_1"
        ].apply(lambda x: len(x.split()))
        df_response_judging_copy["output_2_num_words"] = df_response_judging_copy[
            "output_2"
        ].apply(lambda x: len(x.split()))
        df_response_judging_copy["output_num_words_diff"] = (
            df_response_judging_copy["output_1_num_words"]
            - df_response_judging_copy["output_2_num_words"]
        )
        df_response_judging_copy["assigned_preference"] = (
            df_response_judging_copy["preference"]
            .round(0)
            .apply(get_preference_from_rounded_score)
        )
col1, col2 = st.columns(2)
fig = px.scatter(
df_response_judging_copy,
x="output_1_num_words",
y="output_2_num_words",
color="assigned_preference",
title=f"Pairwise preference based on response length",
labels={
"output_1_num_words": f"{fixed_model} (1) number of words",
"output_2_num_words": "Target model (2) number of words",
},
color_discrete_map={
"[1>2]": "blue",
"[2>1]": "orangered",
"[1=2]": "green",
},
)
col1.plotly_chart(fig)
# Plot of output_num_words_diff histogram, colored by assigned_preference.
fig = px.histogram(
df_response_judging_copy,
x="output_num_words_diff",
color="assigned_preference",
title=f"Pairwise preference counts based on difference in response length",
color_discrete_map={
"[1>2]": "blue",
"[2>1]": "orangered",
"[1=2]": "green",
},
range_x=[-500, 500],
labels={
"output_num_words_diff": "Length difference in words between gpt4_1106_preview and target model"
},
)
col2.plotly_chart(fig)
with st.expander("Raw data"):
st.dataframe(df)
# Data explorer
with outer_tabs[1]:
        # Add a randomize button at the top of the data explorer tab.
st.markdown("## Choose example")
st.button(
":game_die: Randomize!",
on_click=randomize_selection,
type="primary",
)
left_col, right_col = st.columns([1, 3])
st.session_state.selected_dataset = left_col.selectbox(
"Select Dataset",
["all"] + df_response_judging["dataset"].dropna().unique().tolist(),
key="dataset_selector",
on_change=update_instruction_options,
)
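        # Make sure the instruction options reflect the currently selected dataset on this rerun.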
update_instruction_options()
st.session_state.selected_instruction = right_col.selectbox(
f"Select Instruction ({len(st.session_state.instruction_options)} unique instructions)",
st.session_state.instruction_options,
key="instruction_selector",
on_change=update_instruction,
index=(
st.session_state.instruction_options.index(
st.session_state.selected_instruction
)
if st.session_state.selected_instruction
in st.session_state.instruction_options
else 0
),
)
        # All battles of the fixed reference model (generator_1) against every other model for the selected instruction.
        # Copy the slice so the columns added below do not modify df_response_judging.
        all_models_judgings_details = df_response_judging[
            (df_response_judging["generator_1"] == fixed_model)
            & (
                df_response_judging["instruction"]
                == st.session_state.selected_instruction
            )
        ].copy()
st.divider()
st.markdown(f"## Selected instruction")
st.info(st.session_state.selected_instruction)
st.divider()
st.markdown(f"## Overall Battles")
all_models_judgings_details["output_1_num_words"] = all_models_judgings_details[
"output_1"
].apply(lambda x: len(x.split()))
all_models_judgings_details["output_2_num_words"] = all_models_judgings_details[
"output_2"
].apply(lambda x: len(x.split()))
all_models_judgings_details["output_num_words_diff"] = (
all_models_judgings_details["output_1_num_words"]
- all_models_judgings_details["output_2_num_words"]
)
all_models_judgings_details["assigned_preference"] = (
all_models_judgings_details["preference"]
.round(0)
.apply(get_preference_from_rounded_score)
)
col1, col2, col3 = st.columns(3)
fig = px.histogram(
all_models_judgings_details,
x="output_num_words_diff",
color="assigned_preference",
title=f"Pairwise preference counts based on difference in response length",
color_discrete_map={
"[1>2]": "blue",
"[2>1]": "orangered",
"[1=2]": "green",
},
range_x=[-500, 500],
labels={
"output_num_words_diff": "Difference in number of words between response 1 and 2.",
"assigned_preference": "Assigned Preference",
},
)
col1.plotly_chart(fig)
# Plot of assigned preference counts.
fig = px.histogram(
all_models_judgings_details,
x="assigned_preference",
title=f"Assigned preferences for {fixed_model} vs. all models",
)
col2.plotly_chart(fig)
# Models that are better than the fixed model.
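        # output_1 always comes from the fixed model, so any row gives its word count for this instruction.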
num_words_for_fixed_model = len(
all_models_judgings_details.iloc[0]["output_1"].split()
)
better_models = all_models_judgings_details[
all_models_judgings_details["assigned_preference"] == "[2>1]"
]
shorter_models = better_models[
better_models["output_2_num_words"] <= num_words_for_fixed_model
]
longer_models = better_models[
better_models["output_2_num_words"] > num_words_for_fixed_model
]
        col3.markdown(
            f"### Models that are better than {fixed_model} ({num_words_for_fixed_model} words)"
        )
        if not shorter_models.empty:
shorter_models_string = ""
for _, shorter_model in shorter_models.iterrows():
if shorter_model["generator_2"] != fixed_model:
shorter_models_string += f"- {shorter_model['generator_2']} ({shorter_model['output_2_num_words']})\n"
col3.markdown("**With shorter or equal length responses:**")
col3.markdown(shorter_models_string)
else:
col3.write("None")
        if not longer_models.empty:
longer_models_string = ""
for _, longer_model in longer_models.iterrows():
if longer_model["generator_2"] != fixed_model:
longer_models_string += f"- {longer_model['generator_2']} ({longer_model['output_2_num_words']})\n"
col3.markdown("**With longer responses:**")
col3.markdown(longer_models_string)
else:
col3.write("None")
# Judging details.
st.markdown(f"## Individual Battle Details")
judging_details = df_response_judging[
(df_response_judging["generator_1"] == fixed_model)
& (df_response_judging["generator_2"] == st.session_state.selected_model)
& (
df_response_judging["instruction"]
== st.session_state.selected_instruction
)
]
if not judging_details["preference"].empty:
preference = get_preference(judging_details["preference"])
            if preference == "[1>2]":
                st.write(
                    f"**{fixed_model}** is better than **{st.session_state.selected_model}**"
                )
            elif preference == "[2>1]":
                st.write(
                    f"**{st.session_state.selected_model}** is better than **{fixed_model}**"
                )
            else:
                st.write(
                    f"**{fixed_model}** and **{st.session_state.selected_model}** are judged a tie"
                )
st.write(
f"- **Score:** {judging_details['preference'].round(2).item()}\n- **Assigned preference:** {preference}"
)
with st.expander("Additional information"):
st.write(
judging_details[
[
"instruction",
"time_per_example",
"price_per_example",
"raw_completion",
]
]
)
# Create two columns for model selectors
st.markdown("## Responses")
col1, col2 = st.columns(2)
with col1:
st.selectbox(
"Reference model",
[fixed_model],
key="fixed_model",
)
# Get the response string for the fixed model
if st.session_state.selected_instruction:
preference = get_preference(judging_details["preference"])
response_details_fixed = df_response_judging[
(
df_response_judging["instruction"]
== st.session_state.selected_instruction
)
& (df_response_judging["generator_1"] == fixed_model)
].iloc[0]
st.write(
f'Number of words: {len(response_details_fixed["output_1"].split())}'
)
# Display the response string
if preference == "[1>2]":
st.success(response_details_fixed["output_1"])
else:
st.error(response_details_fixed["output_1"])
with col2:
st.session_state.selected_model = st.selectbox(
"Select Model",
model_options,
key="model_selector",
on_change=update_model,
                index=(
                    model_options.index(st.session_state.selected_model)
                    if st.session_state.selected_model in model_options
                    else 0
                ),
)
# Get the response string for the selected model
if (
st.session_state.selected_model
and st.session_state.selected_instruction
):
response_details_dynamic = df_response_judging[
(
df_response_judging["instruction"]
== st.session_state.selected_instruction
)
& (
df_response_judging["generator_2"]
== st.session_state.selected_model
)
].iloc[0]
st.write(
f'Number of words: {len(response_details_dynamic["output_2"].split())}'
)
# Display the response string
if preference == "[2>1]":
st.success(response_details_dynamic["output_2"])
else:
st.error(response_details_dynamic["output_2"])
if __name__ == "__main__":
app()