import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import random
# Set the layout to wide
st.set_page_config(layout="wide")
def prep_rankings_table(df, y_column):
# Create a copy of the dataframe.
df_copy = df.copy()
# Select the columns we care about, sort by the y column, and reset the index.
df_copy = (
df_copy[
[
"model_name",
y_column,
"num_words_mean",
]
]
.sort_values(y_column, ascending=False)
.reset_index()
)
# Create a rank column.
df_copy["rank"] = df_copy.index + 1
# Round the y column.
df_copy[y_column] = df_copy[y_column].round(2)
# Fix the order.
df_copy = df_copy[["rank", "model_name", y_column, "num_words_mean"]]
return df_copy
def get_preference(preference_score):
    """Convert a single-element preference Series into a preference label."""
    rounded_preference_score = int(preference_score.round(0).iloc[0])
    return get_preference_from_rounded_score(rounded_preference_score)


def get_preference_from_rounded_score(score):
    """Map a rounded preference score to a label: 2 means output 2 wins, 1 means output 1 wins, anything else is a tie."""
    if score == 2:
        return "[2>1]"
    elif score == 1:
        return "[1>2]"
    return "[1=2]"
def app():
fixed_model = "gpt4_1106_preview"
    # Initialize session state variables if they do not exist.
if "selected_instruction" not in st.session_state:
st.session_state.selected_instruction = None
if "selected_model" not in st.session_state:
st.session_state.selected_model = "gpt4"
if "selected_judge" not in st.session_state:
st.session_state.selected_judge = None
if "selected_dataset" not in st.session_state:
st.session_state.selected_dataset = "NEW"
if "instruction_options" not in st.session_state:
st.session_state.instruction_options = []
    # Update the instruction options to match the currently selected dataset.
def update_instruction_options():
selected_dataset = st.session_state.dataset_selector
if selected_dataset == "all" or selected_dataset == "NEW":
instruction_options = df_response_judging["instruction"].unique().tolist()
elif (
selected_dataset == "None"
or selected_dataset is None
or str(selected_dataset) == ""
):
instruction_options = (
df_response_judging[pd.isna(df_response_judging["dataset"])][
"instruction"
]
.unique()
.tolist()
)
else:
instruction_options = (
df_response_judging[df_response_judging["dataset"] == selected_dataset][
"instruction"
]
.unique()
.tolist()
)
st.session_state.instruction_options = instruction_options
def update_instruction():
st.session_state.selected_instruction = st.session_state.instruction_selector
def update_model():
st.session_state.selected_model = st.session_state.model_selector
def update_judge():
st.session_state.selected_judge = st.session_state.judge_selector
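    # Pick a random dataset, model, and instruction for the data explorer tab.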
def randomize_selection():
st.session_state.dataset_selector = random.choice(
["all"] + df_response_judging["dataset"].dropna().unique().tolist()
)
st.session_state.selected_model = random.choice(model_options)
update_instruction_options()
st.session_state.selected_instruction = random.choice(
st.session_state.instruction_options
)
st.title("AlpacaEval Visualizations")
outer_tabs = st.tabs(["Length bias in overall win rate", "Data explorer"])
# Load the data
df = pd.read_json("data/model_win_rates.jsonl", lines=True, orient="records")
# df_responses = pd.read_json("data/df_responses.jsonl", lines=True, orient="records")
df_response_judging = pd.read_json(
"data/df_response_judging.jsonl", lines=True, orient="records"
)
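    # Each row of df_response_judging is one pairwise battle: generator_1 is the fixed reference model,
    # generator_2 is the comparison model, and "preference" holds the judge's score.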
# Prepare the model selector options
model_options = df_response_judging["generator_2"].unique().tolist()
with outer_tabs[0]:
# Define the preset groups
presets = {
"gpt": df[df["model_name"].str.contains("openai|gpt", case=False)][
"model_name"
].tolist(),
"claude": df[df["model_name"].str.contains("claude", case=False)][
"model_name"
].tolist(),
"moa": df[df["model_name"].str.contains("moa", case=False)][
"model_name"
].tolist(),
"llama": df[df["model_name"].str.contains("llama", case=False)][
"model_name"
].tolist(),
"custom": [],
}
# Add radio button for preset groups
preset_selection = st.radio(
"Select a preset group of models or choose 'custom' to select manually.",
options=["custom", "gpt", "claude", "moa", "llama"],
)
st.divider()
# Add multiselect for custom model selection
if preset_selection == "custom":
selected_models = st.multiselect(
"Select models to highlight", options=df["model_name"].unique()
)
else:
selected_models = presets[preset_selection]
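        # Helper that scatters win rate (y_column) against mean response length in words and tokens,
        # highlights any selected models, and overlays OLS trendlines.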
def create_scatter_plot(df, y_column, selected_models, title):
fig = go.Figure()
# Add scatter plots for num_words_mean and num_tokens_mean
fig.add_trace(
go.Scatter(
x=df["num_words_mean"],
y=df[y_column],
mode="markers",
name="words",
text=df["model_name"],
marker=dict(size=5, color="skyblue"),
showlegend=True,
)
)
fig.add_trace(
go.Scatter(
x=df["num_tokens_mean"],
y=df[y_column],
mode="markers",
name="tokens",
text=df["model_name"],
marker=dict(size=5, color="orange"),
showlegend=True,
visible="legendonly", # Make 'words' trace initially visible only in legend
)
)
# Highlight selected models
if selected_models:
selected_data = df[df["model_name"].isin(selected_models)]
fig.add_trace(
go.Scatter(
x=selected_data["num_words_mean"],
y=selected_data[y_column],
mode="markers",
name="selected words",
text=selected_data["model_name"],
marker=dict(size=10, color="blue"),
showlegend=True,
)
)
fig.add_trace(
go.Scatter(
x=selected_data["num_tokens_mean"],
y=selected_data[y_column],
mode="markers",
name="selected tokens",
text=selected_data["model_name"],
marker=dict(size=10, color="orangered"),
showlegend=True,
visible="legendonly", # Make 'selected words' trace initially visible only in legend
)
)
# Add trendlines
def add_trendline(fig, x, y, name, color, visibility="legendonly"):
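                # Fit an ordinary least squares line of df[y] on df[x], draw it on the figure, and return the fit's R².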
X = sm.add_constant(df[x])
model = sm.OLS(df[y], X).fit()
trendline = model.predict(X)
fig.add_trace(
go.Scatter(
x=df[x],
y=trendline,
mode="lines",
name=f"{name} trendline",
line=dict(color=color, width=2),
visible=visibility, # Control the initial visibility
)
)
return model.rsquared
r_squared_words = add_trendline(
fig, "num_words_mean", y_column, "words", "blue", visibility=True
)
r_squared_tokens = add_trendline(
fig, "num_tokens_mean", y_column, "tokens", "orangered"
)
# Update layout with titles and labels
fig.update_layout(
xaxis_title="Mean length",
yaxis_title=(
"Win rate"
if y_column == "win_rate"
else (
"LC Win Rate"
if y_column == "length_controlled_winrate"
else "Discrete Win Rate"
)
),
title=title,
legend_title="Legend",
)
return fig, r_squared_words, r_squared_tokens
st.markdown("## Overall win rate")
y_column1 = "length_controlled_winrate"
y_column2 = "win_rate"
y_column3 = "discrete_win_rate"
fig1, r_squared_words_1, r_squared_tokens_1 = create_scatter_plot(
df, y_column1, selected_models, "Length-Controlled Win Rate"
)
fig2, r_squared_words_2, r_squared_tokens_2 = create_scatter_plot(
df, y_column2, selected_models, "Win Rate"
)
fig3, r_squared_words_3, r_squared_tokens_3 = create_scatter_plot(
df, y_column3, selected_models, "Discrete Win Rate"
)
# Create tabs for each chart
tab1, tab2, tab3 = st.tabs(["LC Win Rate", "Win Rate", "Discrete Win Rate"])
with tab1:
col1, col2 = st.columns([3, 2])
col1.plotly_chart(fig1)
col2.markdown("#### Rankings")
prepped_df = prep_rankings_table(df, "length_controlled_winrate")
col2.dataframe(
prepped_df,
hide_index=True,
)
with st.expander("Trendline R²"):
st.markdown(
f"- R² (Words vs {y_column1}): {r_squared_words_1:.2f} \n- R² (Tokens vs {y_column1}): {r_squared_tokens_1:.2f}"
)
with tab2:
col1, col2 = st.columns([3, 2])
col1.plotly_chart(fig2)
col2.markdown("#### Rankings")
prepped_df = prep_rankings_table(df, "win_rate")
col2.dataframe(
prepped_df,
hide_index=True,
)
with st.expander("Trendline R²"):
st.markdown(
f"- R² (Words vs {y_column2}): {r_squared_words_2:.2f} \n- R² (Tokens vs {y_column2}): {r_squared_tokens_2:.2f}"
)
with tab3:
col1, col2 = st.columns([3, 2])
col1.plotly_chart(fig3)
col2.markdown("#### Rankings")
prepped_df = prep_rankings_table(df, "discrete_win_rate")
col2.dataframe(
prepped_df,
hide_index=True,
)
with st.expander("Trendline R²"):
st.markdown(
f"- R² (Words vs {y_column3}): {r_squared_words_3:.2f}\n- R² (Tokens vs {y_column3}): {r_squared_tokens_3:.2f}"
)
st.markdown("## Length bias in battles")
df_response_judging_copy = df_response_judging.copy()
        # If specific models are highlighted above, restrict the battles to those models.
        if selected_models:
            df_response_judging_copy = df_response_judging_copy[
                df_response_judging_copy["generator_2"].isin(selected_models)
            ]
        # Compute per-response word counts, their difference, and the discretized preference label.
        df_response_judging_copy["output_1_num_words"] = df_response_judging_copy[
            "output_1"
        ].apply(lambda x: len(x.split()))
        df_response_judging_copy["output_2_num_words"] = df_response_judging_copy[
            "output_2"
        ].apply(lambda x: len(x.split()))
        df_response_judging_copy["output_num_words_diff"] = (
            df_response_judging_copy["output_1_num_words"]
            - df_response_judging_copy["output_2_num_words"]
        )
        df_response_judging_copy["assigned_preference"] = (
            df_response_judging_copy["preference"]
            .round(0)
            .apply(get_preference_from_rounded_score)
        )
col1, col2 = st.columns(2)
fig = px.scatter(
df_response_judging_copy,
x="output_1_num_words",
y="output_2_num_words",
color="assigned_preference",
title=f"Pairwise preference based on response length",
labels={
"output_1_num_words": f"{fixed_model} (1) number of words",
"output_2_num_words": "Target model (2) number of words",
},
color_discrete_map={
"[1>2]": "blue",
"[2>1]": "orangered",
"[1=2]": "green",
},
)
col1.plotly_chart(fig)
# Plot of output_num_words_diff histogram, colored by assigned_preference.
fig = px.histogram(
df_response_judging_copy,
x="output_num_words_diff",
color="assigned_preference",
title=f"Pairwise preference counts based on difference in response length",
color_discrete_map={
"[1>2]": "blue",
"[2>1]": "orangered",
"[1=2]": "green",
},
range_x=[-500, 500],
labels={
"output_num_words_diff": "Length difference in words between gpt4_1106_preview and target model"
},
)
col2.plotly_chart(fig)
with st.expander("Raw data"):
st.dataframe(df)
# Data explorer
with outer_tabs[1]:
        # Add a randomize button at the top of the data explorer tab.
st.markdown("## Choose example")
st.button(
":game_die: Randomize!",
on_click=randomize_selection,
type="primary",
)
left_col, right_col = st.columns([1, 3])
st.session_state.selected_dataset = left_col.selectbox(
"Select Dataset",
["all"] + df_response_judging["dataset"].dropna().unique().tolist(),
key="dataset_selector",
on_change=update_instruction_options,
)
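        # Make sure the instruction options reflect the currently selected dataset on this rerun.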
update_instruction_options()
st.session_state.selected_instruction = right_col.selectbox(
f"Select Instruction ({len(st.session_state.instruction_options)} unique instructions)",
st.session_state.instruction_options,
key="instruction_selector",
on_change=update_instruction,
index=(
st.session_state.instruction_options.index(
st.session_state.selected_instruction
)
if st.session_state.selected_instruction
in st.session_state.instruction_options
else 0
),
)
        # All battles of the fixed reference model (generator_1) against every other model for the selected instruction.
        # Copy the slice so the columns added below do not modify df_response_judging.
        all_models_judgings_details = df_response_judging[
            (df_response_judging["generator_1"] == fixed_model)
            & (
                df_response_judging["instruction"]
                == st.session_state.selected_instruction
            )
        ].copy()
st.divider()
st.markdown(f"## Selected instruction")
st.info(st.session_state.selected_instruction)
st.divider()
st.markdown(f"## Overall Battles")
all_models_judgings_details["output_1_num_words"] = all_models_judgings_details[
"output_1"
].apply(lambda x: len(x.split()))
all_models_judgings_details["output_2_num_words"] = all_models_judgings_details[
"output_2"
].apply(lambda x: len(x.split()))
all_models_judgings_details["output_num_words_diff"] = (
all_models_judgings_details["output_1_num_words"]
- all_models_judgings_details["output_2_num_words"]
)
all_models_judgings_details["assigned_preference"] = (
all_models_judgings_details["preference"]
.round(0)
.apply(get_preference_from_rounded_score)
)
col1, col2, col3 = st.columns(3)
fig = px.histogram(
all_models_judgings_details,
x="output_num_words_diff",
color="assigned_preference",
title=f"Pairwise preference counts based on difference in response length",
color_discrete_map={
"[1>2]": "blue",
"[2>1]": "orangered",
"[1=2]": "green",
},
range_x=[-500, 500],
labels={
"output_num_words_diff": "Difference in number of words between response 1 and 2.",
"assigned_preference": "Assigned Preference",
},
)
col1.plotly_chart(fig)
# Plot of assigned preference counts.
fig = px.histogram(
all_models_judgings_details,
x="assigned_preference",
title=f"Assigned preferences for {fixed_model} vs. all models",
)
col2.plotly_chart(fig)
# Models that are better than the fixed model.
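        # output_1 always comes from the fixed model, so any row gives its word count for this instruction.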
num_words_for_fixed_model = len(
all_models_judgings_details.iloc[0]["output_1"].split()
)
better_models = all_models_judgings_details[
all_models_judgings_details["assigned_preference"] == "[2>1]"
]
shorter_models = better_models[
better_models["output_2_num_words"] <= num_words_for_fixed_model
]
longer_models = better_models[
better_models["output_2_num_words"] > num_words_for_fixed_model
]
        col3.markdown(
            f"### Models that are better than {fixed_model} ({num_words_for_fixed_model} words)"
        )
        if not shorter_models.empty:
shorter_models_string = ""
for _, shorter_model in shorter_models.iterrows():
if shorter_model["generator_2"] != fixed_model:
shorter_models_string += f"- {shorter_model['generator_2']} ({shorter_model['output_2_num_words']})\n"
col3.markdown("**With shorter or equal length responses:**")
col3.markdown(shorter_models_string)
else:
col3.write("None")
        if not longer_models.empty:
longer_models_string = ""
for _, longer_model in longer_models.iterrows():
if longer_model["generator_2"] != fixed_model:
longer_models_string += f"- {longer_model['generator_2']} ({longer_model['output_2_num_words']})\n"
col3.markdown("**With longer responses:**")
col3.markdown(longer_models_string)
else:
col3.write("None")
# Judging details.
st.markdown(f"## Individual Battle Details")
judging_details = df_response_judging[
(df_response_judging["generator_1"] == fixed_model)
& (df_response_judging["generator_2"] == st.session_state.selected_model)
& (
df_response_judging["instruction"]
== st.session_state.selected_instruction
)
]
if not judging_details["preference"].empty:
preference = get_preference(judging_details["preference"])
            if preference == "[1>2]":
                st.write(
                    f"**{fixed_model}** is better than **{st.session_state.selected_model}**"
                )
            elif preference == "[2>1]":
                st.write(
                    f"**{st.session_state.selected_model}** is better than **{fixed_model}**"
                )
            else:
                st.write(
                    f"**{fixed_model}** and **{st.session_state.selected_model}** are judged a tie"
                )
st.write(
f"- **Score:** {judging_details['preference'].round(2).item()}\n- **Assigned preference:** {preference}"
)
with st.expander("Additional information"):
st.write(
judging_details[
[
"instruction",
"time_per_example",
"price_per_example",
"raw_completion",
]
]
)
# Create two columns for model selectors
st.markdown("## Responses")
col1, col2 = st.columns(2)
with col1:
st.selectbox(
"Reference model",
[fixed_model],
key="fixed_model",
)
# Get the response string for the fixed model
if st.session_state.selected_instruction:
preference = get_preference(judging_details["preference"])
response_details_fixed = df_response_judging[
(
df_response_judging["instruction"]
== st.session_state.selected_instruction
)
& (df_response_judging["generator_1"] == fixed_model)
].iloc[0]
st.write(
f'Number of words: {len(response_details_fixed["output_1"].split())}'
)
# Display the response string
if preference == "[1>2]":
st.success(response_details_fixed["output_1"])
else:
st.error(response_details_fixed["output_1"])
with col2:
st.session_state.selected_model = st.selectbox(
"Select Model",
model_options,
key="model_selector",
on_change=update_model,
                index=(
                    model_options.index(st.session_state.selected_model)
                    if st.session_state.selected_model in model_options
                    else 0
                ),
)
# Get the response string for the selected model
if (
st.session_state.selected_model
and st.session_state.selected_instruction
):
response_details_dynamic = df_response_judging[
(
df_response_judging["instruction"]
== st.session_state.selected_instruction
)
& (
df_response_judging["generator_2"]
== st.session_state.selected_model
)
].iloc[0]
st.write(
f'Number of words: {len(response_details_dynamic["output_2"].split())}'
)
# Display the response string
if preference == "[2>1]":
st.success(response_details_dynamic["output_2"])
else:
st.error(response_details_dynamic["output_2"])
if __name__ == "__main__":
app()