import streamlit as st
import pandas as pd
from huggingface_hub import HfApi, ModelCard
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
import re
from io import StringIO
from yall import create_yall  # Local helper that builds the leaderboard markdown table (see yall.py)
import plotly.graph_objs as go

def calculate_pages(df, items_per_page):
    """Calculate the number of pages needed for pagination."""
    return -(-len(df) // items_per_page)  # Equivalent to math.ceil(len(df) / items_per_page)

@st.cache_data
def cached_model_info(_api, model):
    """Fetch model information from the Hugging Face API and cache the result."""
    try:
        return _api.model_info(repo_id=str(model))
    except (RepositoryNotFoundError, RevisionNotFoundError):
        return None

@st.cache_data
def get_model_info(df):
    """Get model information and update the DataFrame with likes and tags."""
    api = HfApi()
    with st.spinner("Fetching model information..."):
        for index, row in df.iterrows():
            model_info = cached_model_info(api, row['Model'].strip())
            if model_info:
                df.loc[index, 'Likes'] = model_info.likes
                df.loc[index, 'Tags'] = ', '.join(model_info.tags)
            else:
                df.loc[index, 'Likes'] = -1
                df.loc[index, 'Tags'] = ''
    return df

def convert_markdown_table_to_dataframe(md_content):
    """Convert a markdown table to a pandas DataFrame."""
    # Strip the leading and trailing pipe characters from every markdown table row.
    cleaned_content = re.sub(r'\|\s*$', '', re.sub(r'^\|\s*', '', md_content, flags=re.MULTILINE), flags=re.MULTILINE)
    df = pd.read_csv(StringIO(cleaned_content), sep=r"\|", engine='python')
    df = df.drop(0, axis=0)  # Drop the markdown separator row (|---|---|...)
    df.columns = df.columns.str.strip()
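    # Each Model cell is expected to hold two markdown links, "[name](model_url) [badge](details_url)";
    # keep the display name and pull the model URL from the first link.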
    model_link_pattern = r'\[(.*?)\]\((.*?)\)\s*\[.*?\]\(.*?\)'
    df['URL'] = df['Model'].apply(lambda x: re.search(model_link_pattern, x).group(2) if re.search(model_link_pattern, x) else None)
    df['Model'] = df['Model'].apply(lambda x: re.sub(model_link_pattern, r'\1', x))
    return df
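
# Illustrative sketch (not from the real data) of the markdown rows convert_markdown_table_to_dataframe
# expects: a header, a separator row, and one row per model whose Model cell holds two links.
# | Model | Average | AGIEval | GPT4All | TruthfulQA | Bigbench |
# |---|---|---|---|---|---|
# | [org/model](https://huggingface.co/org/model) [📄](https://gist.github.com/...) | 52.1 | 38.2 | 71.0 | 55.3 | 43.9 |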

def create_bar_chart(df, category):
    """Create a horizontal bar chart for the specified category."""
    st.write(f"### {category} Scores")
    sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
    fig = go.Figure(go.Bar(
        x=sorted_df[category],
        y=sorted_df['Model'],
        orientation='h',
        marker=dict(color=sorted_df[category], colorscale='Viridis'),
        hoverinfo='x+y',
        text=sorted_df[category],
        textposition='auto'
    ))
    fig.update_layout(
        margin=dict(l=20, r=20, t=20, b=20),
        title=f"Leaderboard for {category} Scores",
        # Scale the chart with the number of rows; height is set on the figure layout
        # rather than passed through st.plotly_chart.
        height=len(df) * 35
    )
    st.plotly_chart(fig, use_container_width=True)

def fetch_merge_configs(df):
    """Fetch and save merge configurations for the top models."""
    df_sorted = df.sort_values(by='Average', ascending=False)
    try:
        with open('/tmp/configurations.txt', 'w') as file:  # Overwrite so repeated runs don't accumulate stale entries
            for index, row in df_sorted.head(20).iterrows():
                model_name = row['Model'].rstrip()
                try:
                    card = ModelCard.load(model_name)
                    file.write(f'Model Name: {model_name}\n')
                    file.write(f'Scores: {row["Average"]}\n')
                    file.write(f'AGIEval: {row["AGIEval"]}\n')
                    file.write(f'GPT4All: {row["GPT4All"]}\n')
                    file.write(f'TruthfulQA: {row["TruthfulQA"]}\n')
                    file.write(f'Bigbench: {row["Bigbench"]}\n')
                    file.write(f'Model Card: {card}\n')
                except Exception as e:
                    st.error(f"Error loading model card for {model_name}: {str(e)}")
        with open('/tmp/configurations.txt', 'r') as file:
            content = file.read()
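            # Merge configs are assumed to live in ```yaml fenced blocks inside the model cards;
            # capture everything between the "yaml" marker and the closing fence. The zip below
            # relies on each of the top-20 cards contributing exactly one such block.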
            matches = re.findall(r'yaml(.*?)```', content, re.DOTALL)
        with open('/tmp/configurations2.txt', 'w') as file:
            for row, match in zip(df_sorted[['Model', 'Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']].head(20).values, matches):
                file.write(f'Model Name: {row[0]}\n')
                file.write(f'Scores: {row[1]}\n')
                file.write(f'AGIEval: {row[2]}\n')
                file.write(f'GPT4All: {row[3]}\n')
                file.write(f'TruthfulQA: {row[4]}\n')
                file.write(f'Bigbench: {row[5]}\n')
                file.write('yaml' + match + '```\n')
    except Exception as e:
        st.error(f"Error while fetching merge configs: {str(e)}")

def main():
    """Main function to set up the Streamlit app and display the leaderboard."""
    st.set_page_config(page_title="YALL - Yet Another LLM Leaderboard", layout="wide")
    st.title("🏆 YALL - Yet Another LLM Leaderboard")
    st.markdown("Leaderboard made with 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval) using [Nous](https://huggingface.co/NousResearch) benchmark suite.")
    content = create_yall()
    tab1, tab2 = st.tabs(["🏆 Leaderboard", "📝 About"])

    with tab1:
        if content:
            try:
                score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
                full_df = convert_markdown_table_to_dataframe(content)

                for col in score_columns:
                    full_df[col] = pd.to_numeric(full_df[col].str.strip(), errors='coerce')

                full_df = get_model_info(full_df)
                full_df['Tags'] = full_df['Tags'].fillna('')
                df = pd.DataFrame(columns=full_df.columns)

                show_phi = st.checkbox("Phi (2.8B)", value=True)
                show_mistral = st.checkbox("Mistral (7B)", value=True)
                show_other = st.checkbox("Other", value=True)

                dfs_to_concat = []
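                # Filter by model family using the comma-separated Tags string; the trailing comma
                # in each pattern keeps matches anchored to complete tag names.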
                if show_phi:
                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
                if show_mistral:
                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('mistral,')])
                if show_other:
                    other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
                    dfs_to_concat.append(other_df)

                if dfs_to_concat:
                    df = pd.concat(dfs_to_concat, ignore_index=True)

                search_query = st.text_input("Search models", "")
                if search_query:
                    df = df[df['Model'].str.contains(search_query, case=False)]

                items_per_page = 50
                pages = max(1, calculate_pages(df, items_per_page))  # Keep at least one page so the selectbox is never empty
                page = st.selectbox("Page", list(range(1, pages + 1)))

                df = df.sort_values(by='Average', ascending=False)
                start = (page - 1) * items_per_page
                end = start + items_per_page
                df = df[start:end]

                selected_benchmarks = st.multiselect('Select benchmarks to include in the average', score_columns, default=score_columns)

                if selected_benchmarks:
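                    # 'Filtered Average' is an unweighted mean over just the ticked benchmarks.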
                    df['Filtered Average'] = df[selected_benchmarks].mean(axis=1)
                    df = df.sort_values(by='Filtered Average', ascending=False)
                    st.dataframe(
                        df[['Model'] + selected_benchmarks + ['Filtered Average', 'Likes', 'URL']],
                        use_container_width=True,
                        column_config={
                            "Likes": st.column_config.NumberColumn(
                                "Likes",
                                help="Number of likes on Hugging Face",
                                format="%d ❤️",
                            ),
                            "URL": st.column_config.LinkColumn("URL"),
                        },
                        hide_index=True,
                        height=len(df) * 37,
                    )

                selected_models = st.multiselect('Select models to compare', df['Model'].unique())
                comparison_df = df[df['Model'].isin(selected_models)]
                st.dataframe(comparison_df)

                if st.button("Export to CSV"):
                    csv_data = df.to_csv(index=False)
                    st.download_button(
                        label="Download CSV",
                        data=csv_data,
                        file_name="leaderboard.csv",
                        key="download-csv",
                        help="Click to download the CSV file",
                    )
                if st.button("Fetch Merge-Configs"):
                    fetch_merge_configs(full_df)
                    st.success("Merge configurations have been fetched and saved.")

                # 'Filtered Average' only exists when at least one benchmark is selected above.
                if selected_benchmarks:
                    create_bar_chart(df, 'Filtered Average')

                col1, col2 = st.columns(2)
                with col1:
                    create_bar_chart(df, score_columns[1])
                with col2:
                    create_bar_chart(df, score_columns[2])

                col3, col4 = st.columns(2)
                with col3:
                    create_bar_chart(df, score_columns[3])
                with col4:
                    create_bar_chart(df, score_columns[4])

            except Exception as e:
                st.error("An error occurred while processing the markdown table.")
                st.error(str(e))
        else:
            st.error("Failed to download the content from the URL provided.")
            
    with tab2:
        st.markdown('''
            ### Nous benchmark suite
            Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:
            * [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
            * **GPT4All** (0-shot): `hellaswag,openbookqa,winogrande,arc_easy,arc_challenge,boolq,piqa`
            * [**TruthfulQA**](https://arxiv.org/abs/2109.07958) (0-shot): `truthfulqa_mc`
            * [**Bigbench**](https://arxiv.org/abs/2206.04615) (0-shot): `bigbench_causal_judgement,bigbench_date_understanding,bigbench_disambiguation_qa,bigbench_geometric_shapes,bigbench_logical_deduction_five_objects,bigbench_logical_deduction_seven_objects,bigbench_logical_deduction_three_objects,bigbench_movie_recommendation,bigbench_navigate,bigbench_reasoning_about_colored_objects,bigbench_ruin_names,bigbench_salient_translation_error_detection,bigbench_snarks,bigbench_sports_understanding,bigbench_temporal_sequences,bigbench_tracking_shuffled_objects_five_objects,bigbench_tracking_shuffled_objects_seven_objects,bigbench_tracking_shuffled_objects_three_objects`
            ### Reproducibility
            You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a Colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
            ### Clone this space
            You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
            * Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
            * Create "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens))
            A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations.
        ''')

if __name__ == "__main__":
    main()