# Hugging Face Space: llm-council/alpaca-eval-explorer (status: Running)
# File size: 7,810 Bytes

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm

# Use the wide page layout so the chart and the rankings table fit side by side.
# NOTE(review): Streamlit's docs have historically required set_page_config to
# run before any other Streamlit command — keep it at module top; confirm
# against the installed Streamlit version.
st.set_page_config(layout="wide")


def prep_rankings_table(df, y_column):
    """Build the rankings table displayed next to a chart.

    Args:
        df: DataFrame containing the columns "model_name",
            "num_words_mean" and ``y_column``.
        y_column: Name of the metric column to rank by, highest value first.

    Returns:
        A new DataFrame with the columns "rank", "model_name", ``y_column``
        (rounded to 2 decimals) and "num_words_mean", in rank order with
        "rank" starting at 1. ``df`` itself is not modified.
    """
    ranked = (
        # Column selection already yields a new frame, so no explicit copy.
        df[["model_name", y_column, "num_words_mean"]]
        .sort_values(y_column, ascending=False)
        # drop=True: the old index is not part of the table, and re-inserting
        # it as a column raises when the index shares a name with one of the
        # selected columns (e.g. an index named "model_name").
        .reset_index(drop=True)
    )

    # Round the metric for display.
    ranked[y_column] = ranked[y_column].round(2)

    # After the reset the index is 0..n-1 in sorted order, so rank = index + 1.
    ranked.insert(0, "rank", ranked.index + 1)
    return ranked


# (data column, tab label, chart title) for each win-rate view, in tab order.
_WIN_RATE_VIEWS = (
    ("length_controlled_winrate", "LC Win Rate", "Length-Controlled Win Rate"),
    ("win_rate", "Win Rate", "Win Rate"),
    ("discrete_win_rate", "Discrete Win Rate", "Discrete Win Rate"),
)

# Regex matched case-insensitively against model names for each preset group.
_PRESET_PATTERNS = {
    "gpt": "openai|gpt",
    "claude": "claude",
    "moa": "moa",
    "llama": "llama",
}

# y-axis titles by plotted column; any other column is labelled
# "Discrete Win Rate".
_Y_AXIS_TITLES = {
    "win_rate": "Win rate",
    "length_controlled_winrate": "LC Win Rate",
}


def _marker_trace(data, x_column, y_column, name, size, color, visible=True):
    """Return a markers-only scatter trace with model names as point text."""
    return go.Scatter(
        x=data[x_column],
        y=data[y_column],
        mode="markers",
        name=name,
        text=data["model_name"],
        marker=dict(size=size, color=color),
        showlegend=True,
        visible=visible,
    )


def _add_trendline(fig, df, x, y, name, color, visibility="legendonly"):
    """Fit ``y ~ const + x`` by OLS, add the fitted line to ``fig``, return R²."""
    X = sm.add_constant(df[x])
    model = sm.OLS(df[y], X).fit()
    fig.add_trace(
        go.Scatter(
            x=df[x],
            y=model.predict(X),
            mode="lines",
            name=f"{name} trendline",
            line=dict(color=color, width=2),
            visible=visibility,  # Control the initial visibility
        )
    )
    return model.rsquared


def _create_scatter_plot(df, y_column, selected_models, title):
    """Build the figure of ``y_column`` against mean word and token length.

    The word-based traces start hidden (legend only); the token-based traces
    start visible. Models listed in ``selected_models`` get larger markers.

    Returns:
        ``(fig, r_squared_words, r_squared_tokens)``, the R² values coming
        from the word and token trendline fits respectively.
    """
    fig = go.Figure()

    # All models, one trace per length measure.
    fig.add_trace(
        _marker_trace(
            df, "num_words_mean", y_column, "words", 5, "skyblue",
            visible="legendonly",
        )
    )
    fig.add_trace(
        _marker_trace(df, "num_tokens_mean", y_column, "tokens", 5, "orange")
    )

    # Highlight selected models with larger markers.
    if selected_models:
        selected = df[df["model_name"].isin(selected_models)]
        fig.add_trace(
            _marker_trace(
                selected, "num_words_mean", y_column, "selected words", 10,
                "blue", visible="legendonly",
            )
        )
        fig.add_trace(
            _marker_trace(
                selected, "num_tokens_mean", y_column, "selected tokens", 10,
                "orangered",
            )
        )

    # Trendlines are always fitted on the full frame, not only the selection.
    r_squared_words = _add_trendline(
        fig, df, "num_words_mean", y_column, "words", "blue"
    )
    r_squared_tokens = _add_trendline(
        fig, df, "num_tokens_mean", y_column, "tokens", "orangered",
        visibility=True,
    )

    fig.update_layout(
        xaxis_title="Mean length",
        yaxis_title=_Y_AXIS_TITLES.get(y_column, "Discrete Win Rate"),
        title=title,
        legend_title="Legend",
    )
    return fig, r_squared_words, r_squared_tokens


def _render_tab(tab, fig, df, y_column, r_squared_words, r_squared_tokens):
    """Render one tab: chart on the left, rankings on the right, R² below."""
    with tab:
        chart_col, table_col = st.columns([3, 2])
        chart_col.plotly_chart(fig)
        table_col.markdown("#### Rankings")
        table_col.dataframe(
            prep_rankings_table(df, y_column),
            hide_index=True,
        )
        with st.expander("Trendline R²"):
            st.markdown(
                f"- R² (Words vs {y_column}): {r_squared_words:.2f}\n"
                f"- R² (Tokens vs {y_column}): {r_squared_tokens:.2f}"
            )


def app():
    """Render the AlpacaEval explorer page.

    Reads ``data/model_win_rates.json``, lets the user pick models to
    highlight (a preset group or a manual selection), and shows one tab per
    win-rate column with a scatter plot against mean length, a rankings
    table and the trendline R² values, followed by the raw data.
    """
    st.title("AlpacaEval Visualizations")

    st.markdown("## Win rate vs. overall mean length")

    # Load the data
    df = pd.read_json("data/model_win_rates.json")

    # Add a model name column (taken from the index) for hover labels
    df["model_name"] = df.index.astype(str)

    # Resolve each preset group to the model names matching its pattern.
    presets = {
        group: df.loc[
            df["model_name"].str.contains(pattern, case=False), "model_name"
        ].tolist()
        for group, pattern in _PRESET_PATTERNS.items()
    }
    presets["custom"] = []

    # Add radio button for preset groups ("custom" first, so it is the default)
    preset_selection = st.radio(
        "Select a preset group of models or choose 'custom' to select manually",
        options=["custom", *_PRESET_PATTERNS],
    )

    # Add multiselect for custom model selection
    if preset_selection == "custom":
        selected_models = st.multiselect(
            "Select models to highlight", options=df["model_name"].unique()
        )
    else:
        selected_models = presets[preset_selection]

    # Build every figure before any tab is created, as the page always has.
    charts = [
        (y_column, *_create_scatter_plot(df, y_column, selected_models, title))
        for y_column, _, title in _WIN_RATE_VIEWS
    ]

    # Create tabs for each chart
    tabs = st.tabs([label for _, label, _ in _WIN_RATE_VIEWS])
    for tab, (y_column, fig, r_squared_words, r_squared_tokens) in zip(
        tabs, charts
    ):
        _render_tab(tab, fig, df, y_column, r_squared_words, r_squared_tokens)

    with st.expander("Raw data"):
        st.dataframe(df)


# Render the page when this file is executed as the main module.
if __name__ == "__main__":
    app()