|
import streamlit as st |
|
import pandas as pd |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
import statsmodels.api as sm |
|
|
|
|
|
# Use the full browser width for the page. This runs at module import time,
# i.e. before app() emits any other Streamlit element.
st.set_page_config(layout="wide")
|
|
|
|
|
def prep_rankings_table(df, y_column):
    """Build a leaderboard table ranked by ``y_column``, highest value first.

    Args:
        df: Frame containing at least ``model_name``, ``num_words_mean`` and
            the column named by ``y_column``. It is left unmodified.
        y_column: Name of the metric column to sort by and display.

    Returns:
        A new DataFrame with the columns ``rank``, ``model_name``,
        ``y_column`` and ``num_words_mean`` (in that order). ``rank`` starts
        at 1 for the top row and the metric is rounded to two decimals.
    """
    wanted = ["model_name", y_column, "num_words_mean"]

    # List-based selection returns a new frame, so the caller's data is
    # never touched by the assignments below.
    ranked = df[wanted].sort_values(by=y_column, ascending=False)

    # After the reset the index is 0..n-1 in sorted order, which is exactly
    # the zero-based rank of each row.
    ranked = ranked.reset_index()
    ranked["rank"] = ranked.index + 1
    ranked[y_column] = ranked[y_column].round(2)

    return ranked[["rank", *wanted]]
|
|
|
|
|
# y-axis label per metric column; any other column falls back to
# "Discrete Win Rate" (see _create_scatter_plot).
_Y_AXIS_TITLES = {
    "win_rate": "Win rate",
    "length_controlled_winrate": "LC Win Rate",
}

# (length column, legend label, base marker color, highlight/trendline color,
#  initial visibility). The words series starts hidden ("legendonly") and can
# be toggled from the legend; the tokens series is shown from the start.
_LENGTH_SERIES = (
    ("num_words_mean", "words", "skyblue", "blue", "legendonly"),
    ("num_tokens_mean", "tokens", "orange", "orangered", True),
)


def _marker_trace(df, x_column, y_column, name, size, color, visible):
    """Return a marker-only scatter trace of ``y_column`` against ``x_column``.

    Each point carries the row's ``model_name`` as its ``text``.
    """
    return go.Scatter(
        x=df[x_column],
        y=df[y_column],
        mode="markers",
        name=name,
        text=df["model_name"],
        marker=dict(size=size, color=color),
        showlegend=True,
        visible=visible,
    )


def _add_trendline(fig, df, x, y, name, color, visibility="legendonly"):
    """Fit ``y ~ const + x`` by OLS, add the fitted line to ``fig``.

    Returns:
        The R² of the fitted model.
    """
    design = sm.add_constant(df[x])
    model = sm.OLS(df[y], design).fit()
    fig.add_trace(
        go.Scatter(
            x=df[x],
            y=model.predict(design),
            mode="lines",
            name=f"{name} trendline",
            line=dict(color=color, width=2),
            visible=visibility,
        )
    )
    return model.rsquared


def _create_scatter_plot(df, y_column, selected_models, title):
    """Build the metric-vs-mean-length figure for one win-rate column.

    Traces are added in a fixed order (this is also the legend order):
    all-model markers for words and tokens, then -- only when
    ``selected_models`` is non-empty -- larger highlight markers for those
    models, then one OLS trendline per length measure.

    Args:
        df: Frame with ``model_name``, ``num_words_mean``,
            ``num_tokens_mean`` and ``y_column``.
        y_column: Metric column plotted on the y axis.
        selected_models: Model names to highlight; may be empty.
        title: Figure title.

    Returns:
        ``(fig, r_squared_words, r_squared_tokens)``.
    """
    fig = go.Figure()

    for x_column, label, base_color, _, visible in _LENGTH_SERIES:
        fig.add_trace(
            _marker_trace(df, x_column, y_column, label, 5, base_color, visible)
        )

    if selected_models:
        selected_data = df[df["model_name"].isin(selected_models)]
        for x_column, label, _, accent_color, visible in _LENGTH_SERIES:
            fig.add_trace(
                _marker_trace(
                    selected_data,
                    x_column,
                    y_column,
                    f"selected {label}",
                    10,
                    accent_color,
                    visible,
                )
            )

    # Trendlines are always fitted on the full frame, not on the selection.
    r_squared_words, r_squared_tokens = (
        _add_trendline(
            fig, df, x_column, y_column, label, accent_color, visibility=visible
        )
        for x_column, label, _, accent_color, visible in _LENGTH_SERIES
    )

    fig.update_layout(
        xaxis_title="Mean length",
        yaxis_title=_Y_AXIS_TITLES.get(y_column, "Discrete Win Rate"),
        title=title,
        legend_title="Legend",
    )

    return fig, r_squared_words, r_squared_tokens


def _render_tab(tab, fig, df, y_column, r_squared_words, r_squared_tokens):
    """Fill one tab: chart on the left, rankings on the right, R² expander below."""
    with tab:
        chart_col, table_col = st.columns([3, 2])
        chart_col.plotly_chart(fig)
        table_col.markdown("#### Rankings")
        table_col.dataframe(
            prep_rankings_table(df, y_column),
            hide_index=True,
        )
        with st.expander("Trendline R²"):
            st.markdown(
                f"- R² (Words vs {y_column}): {r_squared_words:.2f}\n- R² (Tokens vs {y_column}): {r_squared_tokens:.2f}"
            )


def app():
    """Render the AlpacaEval dashboard.

    Loads ``data/model_win_rates.json``, lets the user pick models to
    highlight (via a preset name filter or a manual multiselect), and shows
    one tab per win-rate metric with a scatter plot against mean output
    length, a rankings table and the trendline R² values. The raw frame is
    available in an expander at the bottom.
    """
    st.title("AlpacaEval Visualizations")
    st.markdown("## Win rate vs. overall mean length")

    df = pd.read_json("data/model_win_rates.json")
    # The frame's index is exposed as a regular string column so it can be
    # filtered, shown in hover text and listed in the rankings table.
    df["model_name"] = df.index.astype(str)

    # Preset name -> case-insensitive regex matched against model_name.
    preset_patterns = {
        "gpt": "openai|gpt",
        "claude": "claude",
        "moa": "moa",
        "llama": "llama",
    }
    presets = {
        preset: df.loc[
            df["model_name"].str.contains(pattern, case=False), "model_name"
        ].tolist()
        for preset, pattern in preset_patterns.items()
    }

    preset_selection = st.radio(
        "Select a preset group of models or choose 'custom' to select manually",
        options=["custom", *presets],
    )

    if preset_selection == "custom":
        selected_models = st.multiselect(
            "Select models to highlight", options=df["model_name"].unique()
        )
    else:
        selected_models = presets[preset_selection]

    # (tab label, metric column, figure title)
    views = (
        ("LC Win Rate", "length_controlled_winrate", "Length-Controlled Win Rate"),
        ("Win Rate", "win_rate", "Win Rate"),
        ("Discrete Win Rate", "discrete_win_rate", "Discrete Win Rate"),
    )

    # All figures are built before any tab is created, so a failure while
    # fitting or plotting surfaces before the tab widgets are emitted.
    plots = [
        _create_scatter_plot(df, y_column, selected_models, title)
        for _, y_column, title in views
    ]

    tabs = st.tabs([label for label, _, _ in views])
    for tab, (_, y_column, _), (fig, r2_words, r2_tokens) in zip(tabs, views, plots):
        _render_tab(tab, fig, df, y_column, r2_words, r2_tokens)

    with st.expander("Raw data"):
        st.dataframe(df)
|
|
|
|
|
# Render the dashboard only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    app()
|
|