
Track large files with Git LFS, and expand the app to include a data explorer and more length-based visualizations.
707a231
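A minimal sketch of the Git LFS setup this commit message implies (assuming the JSONL files under data/ are the large files being tracked; the exact patterns are not shown here):

    git lfs install
    git lfs track "data/*.jsonl"
    git add .gitattributes

git lfs track writes the matching patterns into .gitattributes, so committing that file makes the tracking rules part of the repository.
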
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import random

# Set the layout to wide.
st.set_page_config(layout="wide")


def prep_rankings_table(df, y_column):
    # Create a copy of the dataframe.
    df_copy = df.copy()
    # Select the columns we care about, sort by the y column, and reset the index.
    df_copy = (
        df_copy[
            [
                "model_name",
                y_column,
                "num_words_mean",
            ]
        ]
        .sort_values(y_column, ascending=False)
        .reset_index()
    )
    # Create a rank column.
    df_copy["rank"] = df_copy.index + 1
    # Round the y column.
    df_copy[y_column] = df_copy[y_column].round(2)
    # Fix the column order.
    df_copy = df_copy[["rank", "model_name", y_column, "num_words_mean"]]
    return df_copy


def get_preference(preference_score):
    rounded_preference_score = int(preference_score.round(0).iloc[0])
    return get_preference_from_rounded_score(rounded_preference_score)
    # if rounded_preference_score == 2:
    #     return "[2>1]"
    # elif rounded_preference_score == 1:
    #     return "[1>2]"


def get_preference_from_rounded_score(score):
    if score == 2:
        return "[2>1]"
    elif score == 1:
        return "[1>2]"
    return "[1=2]"
    # raise ValueError(f"Invalid score: {score}")


def app():
    fixed_model = "gpt4_1106_preview"

    # Ensure session state variables are initialized if they do not exist.
    if "selected_instruction" not in st.session_state:
        st.session_state.selected_instruction = None
    if "selected_model" not in st.session_state:
        st.session_state.selected_model = "gpt4"
    if "selected_judge" not in st.session_state:
        st.session_state.selected_judge = None
    if "selected_dataset" not in st.session_state:
        st.session_state.selected_dataset = "NEW"
    if "instruction_options" not in st.session_state:
        st.session_state.instruction_options = []

    # Update the instruction options based on the selected dataset.
    def update_instruction_options():
        selected_dataset = st.session_state.dataset_selector
        if selected_dataset == "all" or selected_dataset == "NEW":
            instruction_options = df_response_judging["instruction"].unique().tolist()
        elif (
            selected_dataset == "None"
            or selected_dataset is None
            or str(selected_dataset) == ""
        ):
            instruction_options = (
                df_response_judging[pd.isna(df_response_judging["dataset"])][
                    "instruction"
                ]
                .unique()
                .tolist()
            )
        else:
            instruction_options = (
                df_response_judging[df_response_judging["dataset"] == selected_dataset][
                    "instruction"
                ]
                .unique()
                .tolist()
            )
        st.session_state.instruction_options = instruction_options

    def update_instruction():
        st.session_state.selected_instruction = st.session_state.instruction_selector

    def update_model():
        st.session_state.selected_model = st.session_state.model_selector

    def update_judge():
        st.session_state.selected_judge = st.session_state.judge_selector

    def randomize_selection():
        st.session_state.dataset_selector = random.choice(
            ["all"] + df_response_judging["dataset"].dropna().unique().tolist()
        )
        st.session_state.selected_model = random.choice(model_options)
        update_instruction_options()
        st.session_state.selected_instruction = random.choice(
            st.session_state.instruction_options
        )

    st.title("AlpacaEval Visualizations")
    outer_tabs = st.tabs(["Length bias in overall win rate", "Data explorer"])

    # Load the data.
    df = pd.read_json("data/model_win_rates.jsonl", lines=True, orient="records")
    # df_responses = pd.read_json("data/df_responses.jsonl", lines=True, orient="records")
    df_response_judging = pd.read_json(
        "data/df_response_judging.jsonl", lines=True, orient="records"
    )
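    # Expected columns, inferred from how the dataframes are used below:
    # - model_win_rates.jsonl: model_name, win_rate, length_controlled_winrate,
    #   discrete_win_rate, num_words_mean, num_tokens_mean.
    # - df_response_judging.jsonl: instruction, dataset, generator_1, generator_2,
    #   output_1, output_2, preference, time_per_example, price_per_example,
    #   raw_completion.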

    # Prepare the model selector options.
    model_options = df_response_judging["generator_2"].unique().tolist()

    with outer_tabs[0]:
        # Define the preset groups.
        presets = {
            "gpt": df[df["model_name"].str.contains("openai|gpt", case=False)][
                "model_name"
            ].tolist(),
            "claude": df[df["model_name"].str.contains("claude", case=False)][
                "model_name"
            ].tolist(),
            "moa": df[df["model_name"].str.contains("moa", case=False)][
                "model_name"
            ].tolist(),
            "llama": df[df["model_name"].str.contains("llama", case=False)][
                "model_name"
            ].tolist(),
            "custom": [],
        }

        # Add radio button for preset groups.
        preset_selection = st.radio(
            "Select a preset group of models or choose 'custom' to select manually.",
            options=["custom", "gpt", "claude", "moa", "llama"],
        )

        st.divider()

        # Add multiselect for custom model selection.
        if preset_selection == "custom":
            selected_models = st.multiselect(
                "Select models to highlight", options=df["model_name"].unique()
            )
        else:
            selected_models = presets[preset_selection]

        def create_scatter_plot(df, y_column, selected_models, title):
            fig = go.Figure()

            # Add scatter plots for num_words_mean and num_tokens_mean.
            fig.add_trace(
                go.Scatter(
                    x=df["num_words_mean"],
                    y=df[y_column],
                    mode="markers",
                    name="words",
                    text=df["model_name"],
                    marker=dict(size=5, color="skyblue"),
                    showlegend=True,
                )
            )
            fig.add_trace(
                go.Scatter(
                    x=df["num_tokens_mean"],
                    y=df[y_column],
                    mode="markers",
                    name="tokens",
                    text=df["model_name"],
                    marker=dict(size=5, color="orange"),
                    showlegend=True,
                    visible="legendonly",  # Make the 'tokens' trace initially visible only in the legend.
                )
            )

            # Highlight selected models.
            if selected_models:
                selected_data = df[df["model_name"].isin(selected_models)]
                fig.add_trace(
                    go.Scatter(
                        x=selected_data["num_words_mean"],
                        y=selected_data[y_column],
                        mode="markers",
                        name="selected words",
                        text=selected_data["model_name"],
                        marker=dict(size=10, color="blue"),
                        showlegend=True,
                    )
                )
                fig.add_trace(
                    go.Scatter(
                        x=selected_data["num_tokens_mean"],
                        y=selected_data[y_column],
                        mode="markers",
                        name="selected tokens",
                        text=selected_data["model_name"],
                        marker=dict(size=10, color="orangered"),
                        showlegend=True,
                        visible="legendonly",  # Make the 'selected tokens' trace initially visible only in the legend.
                    )
                )

            # Add OLS trendlines and report their R².
            def add_trendline(fig, x, y, name, color, visibility="legendonly"):
                X = sm.add_constant(df[x])
                model = sm.OLS(df[y], X).fit()
                trendline = model.predict(X)
                fig.add_trace(
                    go.Scatter(
                        x=df[x],
                        y=trendline,
                        mode="lines",
                        name=f"{name} trendline",
                        line=dict(color=color, width=2),
                        visible=visibility,  # Control the initial visibility.
                    )
                )
                return model.rsquared

            r_squared_words = add_trendline(
                fig, "num_words_mean", y_column, "words", "blue", visibility=True
            )
            r_squared_tokens = add_trendline(
                fig, "num_tokens_mean", y_column, "tokens", "orangered"
            )

            # Update the layout with titles and labels.
            fig.update_layout(
                xaxis_title="Mean length",
                yaxis_title=(
                    "Win rate"
                    if y_column == "win_rate"
                    else (
                        "LC Win Rate"
                        if y_column == "length_controlled_winrate"
                        else "Discrete Win Rate"
                    )
                ),
                title=title,
                legend_title="Legend",
            )

            return fig, r_squared_words, r_squared_tokens
st.markdown("## Overall win rate") | |
y_column1 = "length_controlled_winrate" | |
y_column2 = "win_rate" | |
y_column3 = "discrete_win_rate" | |
fig1, r_squared_words_1, r_squared_tokens_1 = create_scatter_plot( | |
df, y_column1, selected_models, "Length-Controlled Win Rate" | |
) | |
fig2, r_squared_words_2, r_squared_tokens_2 = create_scatter_plot( | |
df, y_column2, selected_models, "Win Rate" | |
) | |
fig3, r_squared_words_3, r_squared_tokens_3 = create_scatter_plot( | |
df, y_column3, selected_models, "Discrete Win Rate" | |
) | |
# Create tabs for each chart | |
tab1, tab2, tab3 = st.tabs(["LC Win Rate", "Win Rate", "Discrete Win Rate"]) | |
with tab1: | |
col1, col2 = st.columns([3, 2]) | |
col1.plotly_chart(fig1) | |
col2.markdown("#### Rankings") | |
prepped_df = prep_rankings_table(df, "length_controlled_winrate") | |
col2.dataframe( | |
prepped_df, | |
hide_index=True, | |
) | |
with st.expander("Trendline R²"): | |
st.markdown( | |
f"- R² (Words vs {y_column1}): {r_squared_words_1:.2f} \n- R² (Tokens vs {y_column1}): {r_squared_tokens_1:.2f}" | |
) | |
with tab2: | |
col1, col2 = st.columns([3, 2]) | |
col1.plotly_chart(fig2) | |
col2.markdown("#### Rankings") | |
prepped_df = prep_rankings_table(df, "win_rate") | |
col2.dataframe( | |
prepped_df, | |
hide_index=True, | |
) | |
with st.expander("Trendline R²"): | |
st.markdown( | |
f"- R² (Words vs {y_column2}): {r_squared_words_2:.2f} \n- R² (Tokens vs {y_column2}): {r_squared_tokens_2:.2f}" | |
) | |
with tab3: | |
col1, col2 = st.columns([3, 2]) | |
col1.plotly_chart(fig3) | |
col2.markdown("#### Rankings") | |
prepped_df = prep_rankings_table(df, "discrete_win_rate") | |
col2.dataframe( | |
prepped_df, | |
hide_index=True, | |
) | |
with st.expander("Trendline R²"): | |
st.markdown( | |
f"- R² (Words vs {y_column3}): {r_squared_words_3:.2f}\n- R² (Tokens vs {y_column3}): {r_squared_tokens_3:.2f}" | |
) | |
st.markdown("## Length bias in battles") | |
df_response_judging_copy = df_response_judging.copy() | |
if not selected_models: | |
df_response_judging_copy["output_1_num_words"] = df_response_judging_copy[ | |
"output_1" | |
].apply(lambda x: len(x.split())) | |
df_response_judging_copy["output_2_num_words"] = df_response_judging_copy[ | |
"output_2" | |
].apply(lambda x: len(x.split())) | |
df_response_judging_copy["output_num_words_diff"] = ( | |
df_response_judging_copy["output_1_num_words"] | |
- df_response_judging_copy["output_2_num_words"] | |
) | |
df_response_judging_copy["assigned_preference"] = ( | |
df_response_judging_copy["preference"] | |
.round(0) | |
.apply(get_preference_from_rounded_score) | |
) | |
else: | |
df_response_judging_copy = df_response_judging_copy[ | |
df_response_judging_copy["generator_2"].isin(selected_models) | |
] | |
df_response_judging_copy["output_1_num_words"] = df_response_judging_copy[ | |
"output_1" | |
].apply(lambda x: len(x.split())) | |
df_response_judging_copy["output_2_num_words"] = df_response_judging_copy[ | |
"output_2" | |
].apply(lambda x: len(x.split())) | |
df_response_judging_copy["output_num_words_diff"] = ( | |
df_response_judging_copy["output_1_num_words"] | |
- df_response_judging_copy["output_2_num_words"] | |
) | |
df_response_judging_copy["assigned_preference"] = ( | |
df_response_judging_copy["preference"] | |
.round(0) | |
.apply(get_preference_from_rounded_score) | |
) | |

        col1, col2 = st.columns(2)
        fig = px.scatter(
            df_response_judging_copy,
            x="output_1_num_words",
            y="output_2_num_words",
            color="assigned_preference",
            title="Pairwise preference based on response length",
            labels={
                "output_1_num_words": f"{fixed_model} (1) number of words",
                "output_2_num_words": "Target model (2) number of words",
            },
            color_discrete_map={
                "[1>2]": "blue",
                "[2>1]": "orangered",
                "[1=2]": "green",
            },
        )
        col1.plotly_chart(fig)

        # Plot a histogram of output_num_words_diff, colored by assigned_preference.
        fig = px.histogram(
            df_response_judging_copy,
            x="output_num_words_diff",
            color="assigned_preference",
            title="Pairwise preference counts based on difference in response length",
            color_discrete_map={
                "[1>2]": "blue",
                "[2>1]": "orangered",
                "[1=2]": "green",
            },
            range_x=[-500, 500],
            labels={
                "output_num_words_diff": "Length difference in words between gpt4_1106_preview and target model"
            },
        )
        col2.plotly_chart(fig)

        with st.expander("Raw data"):
            st.dataframe(df)

    # Data explorer.
    with outer_tabs[1]:
        # Add a randomize button at the top of the tab.
        st.markdown("## Choose example")
        st.button(
            ":game_die: Randomize!",
            on_click=randomize_selection,
            type="primary",
        )

        left_col, right_col = st.columns([1, 3])
        st.session_state.selected_dataset = left_col.selectbox(
            "Select Dataset",
            ["all"] + df_response_judging["dataset"].dropna().unique().tolist(),
            key="dataset_selector",
            on_change=update_instruction_options,
        )
        update_instruction_options()

        st.session_state.selected_instruction = right_col.selectbox(
            f"Select Instruction ({len(st.session_state.instruction_options)} unique instructions)",
            st.session_state.instruction_options,
            key="instruction_selector",
            on_change=update_instruction,
            index=(
                st.session_state.instruction_options.index(
                    st.session_state.selected_instruction
                )
                if st.session_state.selected_instruction
                in st.session_state.instruction_options
                else 0
            ),
        )

        # All battles for the selected instruction with the fixed model as generator_1.
        # Use .copy() so that adding the derived columns below does not trigger a
        # SettingWithCopyWarning on a slice of df_response_judging.
        all_models_judgings_details = df_response_judging[
            (df_response_judging["generator_1"] == fixed_model)
            & (
                df_response_judging["instruction"]
                == st.session_state.selected_instruction
            )
        ].copy()

        st.divider()
        st.markdown("## Selected instruction")
        st.info(st.session_state.selected_instruction)

        st.divider()
        st.markdown("## Overall Battles")
        all_models_judgings_details["output_1_num_words"] = all_models_judgings_details[
            "output_1"
        ].apply(lambda x: len(x.split()))
        all_models_judgings_details["output_2_num_words"] = all_models_judgings_details[
            "output_2"
        ].apply(lambda x: len(x.split()))
        all_models_judgings_details["output_num_words_diff"] = (
            all_models_judgings_details["output_1_num_words"]
            - all_models_judgings_details["output_2_num_words"]
        )
        all_models_judgings_details["assigned_preference"] = (
            all_models_judgings_details["preference"]
            .round(0)
            .apply(get_preference_from_rounded_score)
        )
        # st.write(all_models_judgings_details)

        col1, col2, col3 = st.columns(3)
        fig = px.histogram(
            all_models_judgings_details,
            x="output_num_words_diff",
            color="assigned_preference",
            title="Pairwise preference counts based on difference in response length",
            color_discrete_map={
                "[1>2]": "blue",
                "[2>1]": "orangered",
                "[1=2]": "green",
            },
            range_x=[-500, 500],
            labels={
                "output_num_words_diff": "Difference in number of words between response 1 and 2.",
                "assigned_preference": "Assigned Preference",
            },
        )
        col1.plotly_chart(fig)

        # Plot of assigned preference counts.
        fig = px.histogram(
            all_models_judgings_details,
            x="assigned_preference",
            title=f"Assigned preferences for {fixed_model} vs. all models",
        )
        col2.plotly_chart(fig)

        # Models that are better than the fixed model.
        num_words_for_fixed_model = len(
            all_models_judgings_details.iloc[0]["output_1"].split()
        )
        better_models = all_models_judgings_details[
            all_models_judgings_details["assigned_preference"] == "[2>1]"
        ]
        shorter_models = better_models[
            better_models["output_2_num_words"] <= num_words_for_fixed_model
        ]
        longer_models = better_models[
            better_models["output_2_num_words"] > num_words_for_fixed_model
        ]

        col3.markdown(
            f"### Models that are better than {fixed_model} ({num_words_for_fixed_model})"
        )
        if shorter_models.size != 0:
            shorter_models_string = ""
            for _, shorter_model in shorter_models.iterrows():
                if shorter_model["generator_2"] != fixed_model:
                    shorter_models_string += f"- {shorter_model['generator_2']} ({shorter_model['output_2_num_words']})\n"
            col3.markdown("**With shorter or equal length responses:**")
            col3.markdown(shorter_models_string)
        else:
            col3.write("None")
        if longer_models.size != 0:
            longer_models_string = ""
            for _, longer_model in longer_models.iterrows():
                if longer_model["generator_2"] != fixed_model:
                    longer_models_string += f"- {longer_model['generator_2']} ({longer_model['output_2_num_words']})\n"
            col3.markdown("**With longer responses:**")
            col3.markdown(longer_models_string)
        else:
            col3.write("None")

        # Judging details.
        st.markdown("## Individual Battle Details")
        judging_details = df_response_judging[
            (df_response_judging["generator_1"] == fixed_model)
            & (df_response_judging["generator_2"] == st.session_state.selected_model)
            & (
                df_response_judging["instruction"]
                == st.session_state.selected_instruction
            )
        ]

        # if not judging_details.empty:
        if not judging_details["preference"].empty:
            preference = get_preference(judging_details["preference"])
            if preference == "[1>2]":
                st.write(
                    f"**{fixed_model}** is better than **{st.session_state.selected_model}**"
                )
            else:
                st.write(
                    f"**{st.session_state.selected_model}** is better than **{fixed_model}**"
                )
            st.write(
                f"- **Score:** {judging_details['preference'].round(2).item()}\n- **Assigned preference:** {preference}"
            )
            with st.expander("Additional information"):
                st.write(
                    judging_details[
                        [
                            "instruction",
                            "time_per_example",
                            "price_per_example",
                            "raw_completion",
                        ]
                    ]
                )

        # Create two columns for the model selectors.
        st.markdown("## Responses")
        col1, col2 = st.columns(2)
        with col1:
            st.selectbox(
                "Reference model",
                [fixed_model],
                key="fixed_model",
            )
            # Get the response string for the fixed model.
            if st.session_state.selected_instruction:
                preference = get_preference(judging_details["preference"])
                response_details_fixed = df_response_judging[
                    (
                        df_response_judging["instruction"]
                        == st.session_state.selected_instruction
                    )
                    & (df_response_judging["generator_1"] == fixed_model)
                ].iloc[0]
                st.write(
                    f'Number of words: {len(response_details_fixed["output_1"].split())}'
                )
                # Display the response string.
                if preference == "[1>2]":
                    st.success(response_details_fixed["output_1"])
                else:
                    st.error(response_details_fixed["output_1"])

        with col2:
            st.session_state.selected_model = st.selectbox(
                "Select Model",
                model_options,
                key="model_selector",
                on_change=update_model,
                index=(
                    model_options.index(st.session_state.selected_model)
                    if st.session_state.selected_model in model_options
                    else 0
                ),
            )
            # Get the response string for the selected model.
            if (
                st.session_state.selected_model
                and st.session_state.selected_instruction
            ):
                response_details_dynamic = df_response_judging[
                    (
                        df_response_judging["instruction"]
                        == st.session_state.selected_instruction
                    )
                    & (
                        df_response_judging["generator_2"]
                        == st.session_state.selected_model
                    )
                ].iloc[0]
                st.write(
                    f'Number of words: {len(response_details_dynamic["output_2"].split())}'
                )
                # Display the response string.
                if preference == "[2>1]":
                    st.success(response_details_dynamic["output_2"])
                else:
                    st.error(response_details_dynamic["output_2"])


if __name__ == "__main__":
    app()