# Hugging Face Space: llm-council/alpaca-eval-explorer (status: Running)
# File size: 24,173 bytes

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import random

# Use Streamlit's wide page layout. This runs at module level, i.e. before
# app() issues any other Streamlit command.
st.set_page_config(layout="wide")


def prep_rankings_table(df, y_column):
    """Build a leaderboard table ranked by ``y_column`` (highest first).

    Args:
        df: DataFrame with at least ``model_name``, ``num_words_mean`` and
            the column named by ``y_column``. It is not modified.
        y_column: Name of the score column to rank by.

    Returns:
        A new DataFrame with columns ``rank`` (1-based), ``model_name``,
        ``y_column`` (rounded to two decimals) and ``num_words_mean``.
    """
    wanted_columns = ["model_name", y_column, "num_words_mean"]

    # Column selection and sort_values both return new objects, so the
    # caller's frame is left untouched.
    ranked = df[wanted_columns].sort_values(y_column, ascending=False)
    ranked = ranked.reset_index(drop=True)

    ranked[y_column] = ranked[y_column].round(2)

    # After the reset the positional index is 0..n-1, so rank is index + 1.
    ranked.insert(0, "rank", ranked.index + 1)
    return ranked


def get_preference(preference_score):
    """Return the preference label for the first score in ``preference_score``.

    Args:
        preference_score: pandas Series of numeric preference scores. Only
            the first entry is used.

    Returns:
        The label produced by ``get_preference_from_rounded_score`` for the
        first score rounded to the nearest integer, or ``None`` when the
        Series is empty (there is no score to label).
    """
    # An empty selection has no first element; report "no preference"
    # instead of letting .iloc[0] raise IndexError.
    if preference_score.empty:
        return None

    first_rounded = preference_score.round(0).iloc[0]

    # The rounded value is compared as-is rather than through int(): 2.0 == 2
    # still matches, and a NaN score falls through to the tie label (the same
    # result as applying the helper to a whole column) instead of raising
    # ValueError.
    return get_preference_from_rounded_score(first_rounded)


def get_preference_from_rounded_score(score):
    """Translate an already-rounded preference score into a display label.

    Args:
        score: Rounded preference score; ``2`` means output 2 was preferred
            and ``1`` means output 1 was preferred.

    Returns:
        ``"[2>1]"`` for 2, ``"[1>2]"`` for 1 and ``"[1=2]"`` for anything
        else. Unexpected values (including NaN) are deliberately reported as
        a tie rather than raising.
    """
    labelled_scores = ((2, "[2>1]"), (1, "[1>2]"))
    for expected, label in labelled_scores:
        if score == expected:
            return label
    return "[1=2]"


# Marker colours shared by every chart that is coloured by assigned preference.
_PREFERENCE_COLORS = {
    "[1>2]": "blue",
    "[2>1]": "orangered",
    "[1=2]": "green",
}


def _with_length_columns(battles):
    """Return a copy of ``battles`` extended with length and preference columns.

    Adds ``output_1_num_words`` / ``output_2_num_words`` (whitespace-separated
    word counts), ``output_num_words_diff`` (output 1 minus output 2) and
    ``assigned_preference`` (label of the rounded ``preference`` score).
    Working on a copy leaves the caller's frame untouched and avoids pandas'
    SettingWithCopyWarning when ``battles`` is a filtered slice.
    """
    battles = battles.copy()
    battles["output_1_num_words"] = battles["output_1"].apply(
        lambda x: len(x.split())
    )
    battles["output_2_num_words"] = battles["output_2"].apply(
        lambda x: len(x.split())
    )
    battles["output_num_words_diff"] = (
        battles["output_1_num_words"] - battles["output_2_num_words"]
    )
    battles["assigned_preference"] = (
        battles["preference"].round(0).apply(get_preference_from_rounded_score)
    )
    return battles


def _create_scatter_plot(df, y_column, selected_models, title):
    """Plot ``y_column`` against mean response length with OLS trendlines.

    Args:
        df: Per-model frame with ``model_name``, ``num_words_mean``,
            ``num_tokens_mean`` and ``y_column``.
        y_column: Name of the win-rate column plotted on the y axis.
        selected_models: Model names to highlight with larger markers.
        title: Figure title.

    Returns:
        ``(figure, r_squared_words, r_squared_tokens)`` where the R² values
        come from the words and tokens trendline fits respectively.
    """
    fig = go.Figure()

    def add_points(frame, x_column, name, size, color, visible=True):
        fig.add_trace(
            go.Scatter(
                x=frame[x_column],
                y=frame[y_column],
                mode="markers",
                name=name,
                text=frame["model_name"],
                marker=dict(size=size, color=color),
                showlegend=True,
                visible=visible,
            )
        )

    # Word-based traces are drawn immediately; token-based traces start
    # hidden and can be toggled on from the legend.
    add_points(df, "num_words_mean", "words", 5, "skyblue")
    add_points(df, "num_tokens_mean", "tokens", 5, "orange", visible="legendonly")

    # Highlight selected models
    if selected_models:
        highlighted = df[df["model_name"].isin(selected_models)]
        add_points(highlighted, "num_words_mean", "selected words", 10, "blue")
        add_points(
            highlighted,
            "num_tokens_mean",
            "selected tokens",
            10,
            "orangered",
            visible="legendonly",
        )

    def add_trendline(x_column, name, color, visibility="legendonly"):
        # Ordinary least squares of y_column on x_column with an intercept.
        design = sm.add_constant(df[x_column])
        fit = sm.OLS(df[y_column], design).fit()
        fig.add_trace(
            go.Scatter(
                x=df[x_column],
                y=fit.predict(design),
                mode="lines",
                name=f"{name} trendline",
                line=dict(color=color, width=2),
                visible=visibility,  # Control the initial visibility
            )
        )
        return fit.rsquared

    r_squared_words = add_trendline("num_words_mean", "words", "blue", visibility=True)
    r_squared_tokens = add_trendline("num_tokens_mean", "tokens", "orangered")

    axis_titles = {
        "win_rate": "Win rate",
        "length_controlled_winrate": "LC Win Rate",
    }
    fig.update_layout(
        xaxis_title="Mean length",
        yaxis_title=axis_titles.get(y_column, "Discrete Win Rate"),
        title=title,
        legend_title="Legend",
    )

    return fig, r_squared_words, r_squared_tokens


def app():
    """Render the AlpacaEval visualisation app.

    Reads ``data/model_win_rates.jsonl`` and ``data/df_response_judging.jsonl``
    and draws two tabs: win rate versus response length across models, and a
    data explorer for individual battles against the fixed reference model.
    """
    fixed_model = "gpt4_1106_preview"

    # Ensure to initialize session state variables if they do not exist
    session_defaults = {
        "selected_instruction": None,
        "selected_model": "gpt4",
        "selected_judge": None,
        "selected_dataset": "NEW",
        "instruction_options": [],
    }
    for state_key, default_value in session_defaults.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = default_value

    @st.cache_data
    def load_jsonl(path):
        # Cached so the JSON-lines files are not re-parsed on every rerun.
        return pd.read_json(path, lines=True, orient="records")

    def dataset_choices():
        return ["all"] + df_response_judging["dataset"].dropna().unique().tolist()

    # Function to update the instruction options based on selected dataset
    def update_instruction_options():
        selected_dataset = st.session_state.dataset_selector
        if selected_dataset == "all" or selected_dataset == "NEW":
            matching = df_response_judging
        elif (
            selected_dataset == "None"
            or selected_dataset is None
            or str(selected_dataset) == ""
        ):
            # No dataset chosen: use the rows whose dataset is missing.
            matching = df_response_judging[pd.isna(df_response_judging["dataset"])]
        else:
            matching = df_response_judging[
                df_response_judging["dataset"] == selected_dataset
            ]

        st.session_state.instruction_options = matching["instruction"].unique().tolist()

    def update_instruction():
        st.session_state.selected_instruction = st.session_state.instruction_selector

    def update_model():
        st.session_state.selected_model = st.session_state.model_selector

    def randomize_selection():
        st.session_state.dataset_selector = random.choice(dataset_choices())
        # random.choice raises IndexError on an empty sequence, so keep the
        # current selection when there is nothing to choose from.
        if model_options:
            st.session_state.selected_model = random.choice(model_options)
        update_instruction_options()
        if st.session_state.instruction_options:
            st.session_state.selected_instruction = random.choice(
                st.session_state.instruction_options
            )

    st.title("AlpacaEval Visualizations")

    outer_tabs = st.tabs(["Length bias in overall win rate", "Data explorer"])

    # Load the data
    df = load_jsonl("data/model_win_rates.jsonl")
    df_response_judging = load_jsonl("data/df_response_judging.jsonl")

    # Prepare the model selector options
    model_options = df_response_judging["generator_2"].unique().tolist()

    with outer_tabs[0]:
        # Define the preset groups (case-insensitive match on the model name)
        preset_patterns = {
            "gpt": "openai|gpt",
            "claude": "claude",
            "moa": "moa",
            "llama": "llama",
        }
        presets = {
            group: df[df["model_name"].str.contains(pattern, case=False)][
                "model_name"
            ].tolist()
            for group, pattern in preset_patterns.items()
        }
        presets["custom"] = []

        # Add radio button for preset groups
        preset_selection = st.radio(
            "Select a preset group of models or choose 'custom' to select manually.",
            options=["custom", "gpt", "claude", "moa", "llama"],
        )

        st.divider()

        # Add multiselect for custom model selection
        if preset_selection == "custom":
            selected_models = st.multiselect(
                "Select models to highlight", options=df["model_name"].unique()
            )
        else:
            selected_models = presets[preset_selection]

        st.markdown("## Overall win rate")
        y_column1 = "length_controlled_winrate"
        y_column2 = "win_rate"
        y_column3 = "discrete_win_rate"

        fig1, r_squared_words_1, r_squared_tokens_1 = _create_scatter_plot(
            df, y_column1, selected_models, "Length-Controlled Win Rate"
        )
        fig2, r_squared_words_2, r_squared_tokens_2 = _create_scatter_plot(
            df, y_column2, selected_models, "Win Rate"
        )
        fig3, r_squared_words_3, r_squared_tokens_3 = _create_scatter_plot(
            df, y_column3, selected_models, "Discrete Win Rate"
        )

        def render_win_rate_tab(fig, y_column, r_squared_markdown):
            # Chart on the left, rankings table on the right, R² underneath.
            chart_col, table_col = st.columns([3, 2])
            chart_col.plotly_chart(fig)
            table_col.markdown("#### Rankings")
            table_col.dataframe(
                prep_rankings_table(df, y_column),
                hide_index=True,
            )
            with st.expander("Trendline R²"):
                st.markdown(r_squared_markdown)

        # Create tabs for each chart
        tab1, tab2, tab3 = st.tabs(["LC Win Rate", "Win Rate", "Discrete Win Rate"])

        with tab1:
            render_win_rate_tab(
                fig1,
                y_column1,
                f"- R² (Words vs {y_column1}): {r_squared_words_1:.2f} \n- R² (Tokens vs {y_column1}): {r_squared_tokens_1:.2f}",
            )

        with tab2:
            render_win_rate_tab(
                fig2,
                y_column2,
                f"- R² (Words vs {y_column2}): {r_squared_words_2:.2f} \n- R² (Tokens vs {y_column2}): {r_squared_tokens_2:.2f}",
            )

        with tab3:
            render_win_rate_tab(
                fig3,
                y_column3,
                f"- R² (Words vs {y_column3}): {r_squared_words_3:.2f}\n- R² (Tokens vs {y_column3}): {r_squared_tokens_3:.2f}",
            )

        st.markdown("## Length bias in battles")

        # Restrict to the highlighted models when any are selected.
        battles = df_response_judging
        if selected_models:
            battles = battles[battles["generator_2"].isin(selected_models)]
        battles = _with_length_columns(battles)

        col1, col2 = st.columns(2)
        fig = px.scatter(
            battles,
            x="output_1_num_words",
            y="output_2_num_words",
            color="assigned_preference",
            title="Pairwise preference based on response length",
            labels={
                "output_1_num_words": f"{fixed_model} (1) number of words",
                "output_2_num_words": "Target model (2) number of words",
            },
            color_discrete_map=_PREFERENCE_COLORS,
        )
        col1.plotly_chart(fig)

        # Plot of output_num_words_diff histogram, colored by assigned_preference.
        fig = px.histogram(
            battles,
            x="output_num_words_diff",
            color="assigned_preference",
            title="Pairwise preference counts based on difference in response length",
            color_discrete_map=_PREFERENCE_COLORS,
            range_x=[-500, 500],
            labels={
                "output_num_words_diff": f"Length difference in words between {fixed_model} and target model"
            },
        )
        col2.plotly_chart(fig)

        with st.expander("Raw data"):
            st.dataframe(df)

    # Data explorer
    with outer_tabs[1]:
        # Add randomize button at the top of the app
        st.markdown("## Choose example")
        st.button(
            ":game_die: Randomize!",
            on_click=randomize_selection,
            type="primary",
        )

        left_col, right_col = st.columns([1, 3])

        st.session_state.selected_dataset = left_col.selectbox(
            "Select Dataset",
            dataset_choices(),
            key="dataset_selector",
            on_change=update_instruction_options,
        )
        update_instruction_options()
        instruction_options = st.session_state.instruction_options
        st.session_state.selected_instruction = right_col.selectbox(
            f"Select Instruction ({len(instruction_options)} unique instructions)",
            instruction_options,
            key="instruction_selector",
            on_change=update_instruction,
            index=(
                instruction_options.index(st.session_state.selected_instruction)
                if st.session_state.selected_instruction in instruction_options
                else 0
            ),
        )

        # Every battle of the reference model on the selected instruction.
        all_models_judgings_details = _with_length_columns(
            df_response_judging[
                (df_response_judging["generator_1"] == fixed_model)
                & (
                    df_response_judging["instruction"]
                    == st.session_state.selected_instruction
                )
            ]
        )

        st.divider()

        st.markdown("## Selected instruction")
        st.info(st.session_state.selected_instruction)

        st.divider()

        st.markdown("## Overall Battles")

        col1, col2, col3 = st.columns(3)

        fig = px.histogram(
            all_models_judgings_details,
            x="output_num_words_diff",
            color="assigned_preference",
            title="Pairwise preference counts based on difference in response length",
            color_discrete_map=_PREFERENCE_COLORS,
            range_x=[-500, 500],
            labels={
                "output_num_words_diff": "Difference in number of words between response 1 and 2.",
                "assigned_preference": "Assigned Preference",
            },
        )
        col1.plotly_chart(fig)

        # Plot of assigned preference counts.
        fig = px.histogram(
            all_models_judgings_details,
            x="assigned_preference",
            title=f"Assigned preferences for {fixed_model} vs. all models",
        )
        col2.plotly_chart(fig)

        def list_better_models(models, heading):
            # An empty group shows "None"; otherwise a heading followed by one
            # bullet per model, skipping the reference model itself.
            if models.empty:
                col3.write("None")
                return
            bullet_list = "".join(
                f"- {generator} ({num_words})\n"
                for generator, num_words in zip(
                    models["generator_2"], models["output_2_num_words"]
                )
                if generator != fixed_model
            )
            col3.markdown(heading)
            col3.markdown(bullet_list)

        # Models that are better than the fixed model.
        if all_models_judgings_details.empty:
            # Without any row there is no reference response to measure.
            col3.info(f"No battles found for {fixed_model} on this instruction.")
        else:
            num_words_for_fixed_model = len(
                all_models_judgings_details.iloc[0]["output_1"].split()
            )
            better_models = all_models_judgings_details[
                all_models_judgings_details["assigned_preference"] == "[2>1]"
            ]
            not_longer = (
                better_models["output_2_num_words"] <= num_words_for_fixed_model
            )
            col3.markdown(
                f"### Models that are better than {fixed_model} ({num_words_for_fixed_model})"
            )
            list_better_models(
                better_models[not_longer],
                "**With shorter or equal length responses:**",
            )
            list_better_models(
                better_models[~not_longer], "**With longer responses:**"
            )

        # Judging details.
        st.markdown("## Individual Battle Details")
        judging_details = df_response_judging[
            (df_response_judging["generator_1"] == fixed_model)
            & (df_response_judging["generator_2"] == st.session_state.selected_model)
            & (
                df_response_judging["instruction"]
                == st.session_state.selected_instruction
            )
        ]

        # None means "no battle recorded for this model/instruction pair"; it
        # is computed once and reused by the response panels below.
        preference = None
        if not judging_details.empty:
            preference = get_preference(judging_details["preference"])
            if preference == "[1>2]":
                st.write(
                    f"**{fixed_model}** is better than **{st.session_state.selected_model}**"
                )
            elif preference == "[2>1]":
                st.write(
                    f"**{st.session_state.selected_model}** is better than **{fixed_model}**"
                )
            else:
                st.write(
                    f"No clear preference between **{fixed_model}** and **{st.session_state.selected_model}**"
                )
            # Use the first matching row: .item() would raise if the pair had
            # been judged more than once.
            score = float(judging_details["preference"].round(2).iloc[0])
            st.write(f"- **Score:** {score}\n- **Assigned preference:** {preference}")

            with st.expander("Additional information"):
                st.write(
                    judging_details[
                        [
                            "instruction",
                            "time_per_example",
                            "price_per_example",
                            "raw_completion",
                        ]
                    ]
                )

        def show_response(text, winning_label):
            # Green for the preferred response, red for the other one, and a
            # neutral box when no preference is available.
            if preference is None:
                st.info(text)
            elif preference == winning_label:
                st.success(text)
            else:
                st.error(text)

        # Create two columns for model selectors
        st.markdown("## Responses")
        col1, col2 = st.columns(2)

        with col1:
            st.selectbox(
                "Reference model",
                [fixed_model],
                key="fixed_model",
            )

            # Get the response string for the fixed model
            if st.session_state.selected_instruction:
                if all_models_judgings_details.empty:
                    st.info(f"No response from {fixed_model} for this instruction.")
                else:
                    fixed_output = all_models_judgings_details.iloc[0]["output_1"]
                    st.write(f"Number of words: {len(fixed_output.split())}")
                    show_response(fixed_output, "[1>2]")

        with col2:
            st.session_state.selected_model = st.selectbox(
                "Select Model",
                model_options,
                key="model_selector",
                on_change=update_model,
                # The stored model (initially "gpt4") may not exist in the
                # data; fall back to the first option instead of letting
                # list.index raise ValueError.
                index=(
                    model_options.index(st.session_state.selected_model)
                    if st.session_state.selected_model in model_options
                    else 0
                ),
            )

            # Get the response string for the selected model
            if (
                st.session_state.selected_model
                and st.session_state.selected_instruction
            ):
                dynamic_rows = df_response_judging[
                    (
                        df_response_judging["instruction"]
                        == st.session_state.selected_instruction
                    )
                    & (
                        df_response_judging["generator_2"]
                        == st.session_state.selected_model
                    )
                ]
                if dynamic_rows.empty:
                    st.info(
                        f"No response from {st.session_state.selected_model} for this instruction."
                    )
                else:
                    dynamic_output = dynamic_rows.iloc[0]["output_2"]
                    st.write(f"Number of words: {len(dynamic_output.split())}")
                    show_response(dynamic_output, "[2>1]")


# Build the UI when this file is executed as a script.
if __name__ == "__main__":
    app()