Spaces:

CultriX
/

Tiny-LeaderBoard

Running

App Files Files Community

CultriX committed on Dec 23, 2024

Commit

982fdda

verified ·

1 Parent(s): 94d5ed8

Update app.py

Browse files

Files changed (1) hide show

app.py +171 -204

app.py CHANGED Viewed

@@ -1,53 +1,53 @@
-import streamlit as st
 import pandas as pd
-from huggingface_hub import HfApi, ModelCard
-from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
-import re
-from io import StringIO
-from yall import create_yall
 import plotly.graph_objs as go
-def calculate_pages(df, items_per_page):
-    """Calculate the number of pages needed for pagination."""
-    return -(-len(df) // items_per_page)  # Equivalent to math.ceil(len(df) / items_per_page)
-@st.cache_data
-def cached_model_info(_api, model):
-    """Fetch model information from the Hugging Face API and cache the result."""
-    try:
-        return _api.model_info(repo_id=str(model))
-    except (RepositoryNotFoundError, RevisionNotFoundError):
-        return None
-@st.cache_data
-def get_model_info(df):
-    """Get model information and update the DataFrame with likes and tags."""
-    api = HfApi()
-    with st.spinner("Fetching model information..."):
-        for index, row in df.iterrows():
-            model_info = cached_model_info(api, row['Model'].strip())
-            if model_info:
-                df.loc[index, 'Likes'] = model_info.likes
-                df.loc[index, 'Tags'] = ', '.join(model_info.tags)
-            else:
-                df.loc[index, 'Likes'] = -1
-                df.loc[index, 'Tags'] = ''
-    return df
-def convert_markdown_table_to_dataframe(md_content):
-    """Convert a markdown table to a pandas DataFrame."""
-    cleaned_content = re.sub(r'\|\s*$', '', re.sub(r'^\|\s*', '', md_content, flags=re.MULTILINE), flags=re.MULTILINE)
-    df = pd.read_csv(StringIO(cleaned_content), sep="\|", engine='python')
-    df = df.drop(0, axis=0)
-    df.columns = df.columns.str.strip()
-    model_link_pattern = r'\[(.*?)\]\((.*?)\)\s*\[.*?\]\(.*?\)'
-    df['URL'] = df['Model'].apply(lambda x: re.search(model_link_pattern, x).group(2) if re.search(model_link_pattern, x) else None)
-    df['Model'] = df['Model'].apply(lambda x: re.sub(model_link_pattern, r'\1', x))
-    return df
 def create_bar_chart(df, category):
     """Create a horizontal bar chart for the specified category."""
-    st.write(f"### {category} Scores")
     sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
     fig = go.Figure(go.Bar(
         x=sorted_df[category],
@@ -62,164 +62,131 @@ def create_bar_chart(df, category):
         margin=dict(l=20, r=20, t=20, b=20),
         title=f"Leaderboard for {category} Scores"
     )
-    st.plotly_chart(fig, use_container_width=True, height=len(df) * 35)
-def fetch_merge_configs(df):
-    """Fetch and save merge configurations for the top models."""
-    df_sorted = df.sort_values(by='Average', ascending=False)
-    try:
-        with open('/tmp/configurations.txt', 'a') as file:
-            for index, row in df_sorted.head(20).iterrows():
-                model_name = row['Model'].rstrip()
-                try:
-                    card = ModelCard.load(model_name)
-                    file.write(f'Model Name: {model_name}\n')
-                    file.write(f'Scores: {row["Average"]}\n')
-                    file.write(f'AGIEval: {row["AGIEval"]}\n')
-                    file.write(f'GPT4All: {row["GPT4All"]}\n')
-                    file.write(f'TruthfulQA: {row["TruthfulQA"]}\n')
-                    file.write(f'Bigbench: {row["Bigbench"]}\n')
-                    file.write(f'Model Card: {card}\n')
-                except Exception as e:
-                    st.error(f"Error loading model card for {model_name}: {str(e)}")
-        with open('/tmp/configurations.txt', 'r') as file:
-            content = file.read()
-            matches = re.findall(r'yaml(.*?)```', content, re.DOTALL)
-        with open('/tmp/configurations2.txt', 'w') as file:
-            for row, match in zip(df_sorted[['Model', 'Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']].head(20).values, matches):
-                file.write(f'Model Name: {row[0]}\n')
-                file.write(f'Scores: {row[1]}\n')
-                file.write(f'AGIEval: {row[2]}\n')
-                file.write(f'GPT4All: {row[3]}\n')
-                file.write(f'TruthfulQA: {row[4]}\n')
-                file.write(f'Bigbench: {row[5]}\n')
-                file.write('yaml' + match + '```\n')
-    except Exception as e:
-        st.error(f"Error while fetching merge configs: {str(e)}")
-def main():
-    """Main function to set up the Streamlit app and display the leaderboard."""
-    st.set_page_config(page_title="YALL - Yet Another LLM Leaderboard", layout="wide")
-    st.title("🏆 YALL - Yet Another LLM Leaderboard")
-    st.markdown("Leaderboard made with 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval) using [Nous](https://huggingface.co/NousResearch) benchmark suite.")
-    content = create_yall()
-    tab1, tab2 = st.tabs(["🏆 Leaderboard", "📝 About"])
-    with tab1:
-        if content:
-            try:
-                score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
-                full_df = convert_markdown_table_to_dataframe(content)
-                for col in score_columns:
-                    full_df[col] = pd.to_numeric(full_df[col].str.strip(), errors='coerce')
-                full_df = get_model_info(full_df)
-                full_df['Tags'] = full_df['Tags'].fillna('')
-                df = pd.DataFrame(columns=full_df.columns)
-                show_phi = st.checkbox("Phi (2.8B)", value=True)
-                show_mistral = st.checkbox("Mistral (7B)", value=True)
-                show_other = st.checkbox("Other", value=True)
-                dfs_to_concat = []
-                if show_phi:
-                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
-                if show_mistral:
-                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('mistral,')])
-                if show_other:
-                    other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
-                    dfs_to_concat.append(other_df)
-                if dfs_to_concat:
-                    df = pd.concat(dfs_to_concat, ignore_index=True)
-                search_query = st.text_input("Search models", "")
-                if search_query:
-                    df = df[df['Model'].str.contains(search_query, case=False)]
-                items_per_page = 50
-                pages = calculate_pages(df, items_per_page)
-                page = st.selectbox("Page", list(range(1, pages + 1)))
-                df = df.sort_values(by='Average', ascending=False)
-                start = (page - 1) * items_per_page
-                end = start + items_per_page
-                df = df[start:end]
-                selected_benchmarks = st.multiselect('Select benchmarks to include in the average', score_columns, default=score_columns)
-                if selected_benchmarks:
-                    df['Filtered Average'] = df[selected_benchmarks].mean(axis=1)
-                    df = df.sort_values(by='Filtered Average', ascending=False)
-                    st.dataframe(
-                        df[['Model'] + selected_benchmarks + ['Filtered Average', 'Likes', 'URL']],
-                        use_container_width=True,
-                        column_config={
-                            "Likes": st.column_config.NumberColumn(
-                                "Likes",
-                                help="Number of likes on Hugging Face",
-                                format="%d ❤️",
-                            ),
-                            "URL": st.column_config.LinkColumn("URL"),
-                        },
-                        hide_index=True,
-                        height=len(df) * 37,
-                    )
-                selected_models = st.multiselect('Select models to compare', df['Model'].unique())
-                comparison_df = df[df['Model'].isin(selected_models)]
-                st.dataframe(comparison_df)
-                if st.button("Export to CSV"):
-                    csv_data = df.to_csv(index=False)
-                    st.download_button(
-                        label="Download CSV",
-                        data=csv_data,
-                        file_name="leaderboard.csv",
-                        key="download-csv",
-                        help="Click to download the CSV file",
-                    )
-                if st.button("Fetch Merge-Configs"):
-                    fetch_merge_configs(full_df)
-                    st.success("Merge configurations have been fetched and saved.")
-                create_bar_chart(df, 'Filtered Average')
-                col1, col2 = st.columns(2)
-                with col1:
-                    create_bar_chart(df, score_columns[1])
-                with col2:
-                    create_bar_chart(df, score_columns[2])
-                col3, col4 = st.columns(2)
-                with col3:
-                    create_bar_chart(df, score_columns[3])
-                with col4:
-                    create_bar_chart(df, score_columns[4])
-            except Exception as e:
-                st.error("An error occurred while processing the markdown table.")
-                st.error(str(e))
-        else:
-            st.error("Failed to download the content from the URL provided.")
-    with tab2:
-        st.markdown('''
-            ### Nous benchmark suite
-            Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:
-            * [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
-            * **GPT4ALL** (0-shot): `hellaswag,openbookqa,winogrande,arc_easy,arc_challenge,boolq,piqa`
-            * [**TruthfulQA**](https://arxiv.org/abs/2109.07958) (0-shot): `truthfulqa_mc`
-            * [**Bigbench**](https://arxiv.org/abs/2206.04615) (0-shot): `bigbench_causal_judgement,bigbench_date_understanding,bigbench_disambiguation_qa,bigbench_geometric_shapes,bigbench_logical_deduction_five_objects,bigbench_logical_deduction_seven_objects,bigbench_logical_deduction_three_objects,bigbench_movie_recommendation,bigbench_navigate,bigbench_reasoning_about_colored_objects,bigbench_ruin_names,bigbench_salient_translation_error_detection,bigbench_snarks,bigbench_sports_understanding,bigbench_temporal_sequences,bigbench_tracking_shuffled_objects_five_objects,bigbench_tracking_shuffled_objects_seven_objects,bigbench_tracking_shuffled_objects_three_objects`
-            ### Reproducibility
-            You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
-            ### Clone this space
-            You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
-            * Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
-            * Create "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens))
-            A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations.
-        ''')
 if __name__ == "__main__":
-    main()

+import gradio as gr
 import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
 import plotly.graph_objs as go
+import plotly.offline as py
+from io import StringIO
+import base64
+# NOTE(review): init_notebook_mode configures Plotly for notebook output;
+# confirm it is needed (and harmless) when this file runs as a Gradio app.
+py.init_notebook_mode(connected=True)
+# Parse pasted benchmark text into a long-format (Model, Task, Value) table
+def parse_data(file_content):
+    """Parse benchmark text into a DataFrame of per-task values.
+
+    The text is scanned line by line. A line starting with
+    ``hf (pretrained=`` sets the current model to whatever sits between
+    ``pretrained=`` and the next comma. Every later non-empty line that
+    contains ``|`` and does not start with ``-`` is treated as a table row:
+    the first ``|``-separated cell is the task name and the second cell must
+    parse as a float. Rows whose second cell is not numeric are reported with
+    ``print`` and skipped.
+
+    Args:
+        file_content: The full benchmark text as a single string.
+
+    Returns:
+        A DataFrame with columns ``Model``, ``Task`` and ``Value``; it is
+        empty (a message is printed) when no row could be parsed.
+    """
+    lines = file_content.splitlines()
+    model_data = []
+    current_model = None
+    for line in lines:
+        line = line.strip()
+        if line.startswith('hf (pretrained='):
+            current_model = line.split('pretrained=')[1].split(',')[0]
+        elif line and current_model:
+            # Rows seen before the first model header are ignored because
+            # current_model is still None.
+            if not line.startswith('-') and '|' in line:
+                # Parse table row
+                # NOTE(review): a row that begins with '|' yields an empty
+                # parts[0] and puts the first real cell in parts[1], so such
+                # rows end up in the ValueError branch below. Confirm the
+                # expected input is "task | value" and not a pipe-bordered
+                # markdown table.
+                parts = [p.strip() for p in line.split('|')]
+                if len(parts) >= 2:  # Need at least a task cell and a value cell
+                    try:
+                        task_name = parts[0]
+                        value = float(parts[1])  # Extract the numeric value
+                        model_data.append([
+                            current_model,
+                            task_name,  # Task name
+                            value
+                        ])
+                    except ValueError:
+                        print(f"Skipping row due to invalid value: {parts}")
+    if not model_data:
+        print("No valid data found in the file.")
+    return pd.DataFrame(model_data, columns=['Model', 'Task', 'Value'])
+# Calculate average performance
+def calculate_averages(data):
+    """Return the mean ``Value`` per ``Model``.
+
+    Args:
+        data: DataFrame with at least ``Model`` and ``Value`` columns.
+
+    Returns:
+        A DataFrame with columns ``Model`` and ``Average Performance``. When
+        ``data`` is empty a message is printed and an empty frame with those
+        two columns is returned.
+    """
+    if data.empty:
+        print("No data available to calculate averages.")
+        return pd.DataFrame(columns=['Model', 'Average Performance'])
+    return data.groupby('Model')['Value'].mean().reset_index().rename(columns={'Value': 'Average Performance'})
 def create_bar_chart(df, category):
     """Create a horizontal bar chart for the specified category."""
     sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
     fig = go.Figure(go.Bar(
         x=sorted_df[category],
         margin=dict(l=20, r=20, t=20, b=20),
         title=f"Leaderboard for {category} Scores"
     )
+    return fig
+def _figure_to_base64():
+    """Save the current Matplotlib figure as PNG, close it, and return it base64-encoded."""
+    # Imported locally because the file's top-level imports only provide StringIO.
+    from io import BytesIO
+    # PNG output is binary: savefig writes bytes and b64encode needs bytes,
+    # so the buffer has to be a BytesIO (a StringIO raises TypeError here).
+    buffer = BytesIO()
+    plt.savefig(buffer, format='png')
+    plt.close()
+    return base64.b64encode(buffer.getvalue()).decode('utf-8')
+def generate_visualizations(data, averages):
+    """Build the Matplotlib/Seaborn images and Plotly charts for the parsed data.
+
+    Args:
+        data: DataFrame with ``Model``, ``Task`` and ``Value`` columns. It is
+            copied before being modified, so the caller's frame is untouched.
+        averages: DataFrame with ``Model`` and ``Average Performance`` columns.
+
+    Returns:
+        A 6-tuple ``(image_avg, image_line, image_heatmap, image_boxplot,
+        plotly_avg, plotly_tasks)``. The four images are base64-encoded PNG
+        strings, ``plotly_avg`` is an HTML fragment, and ``plotly_tasks`` maps
+        each task name to an HTML fragment. All six are ``None`` when
+        ``averages`` is empty; everything after ``image_avg`` is ``None`` when
+        ``data`` is empty.
+    """
+    sns.set(style='whitegrid')
+    if averages.empty:
+        print("No averages to visualize.")
+        return None, None, None, None, None, None
+    averages = averages.sort_values(by='Average Performance')
+    # Matplotlib average performance plot
+    plt.figure(figsize=(12, 8))
+    sns.barplot(data=averages, x='Average Performance', y='Model', palette='viridis')
+    plt.title('Average Performance of Models', fontsize=16)
+    plt.xlabel('Average Performance', fontsize=12)
+    plt.ylabel('Model', fontsize=12)
+    plt.tight_layout()
+    image_avg = _figure_to_base64()
+    # Line plot for task performance by model
+    # Order models by ascending average so every later plot shares that order.
+    sorted_models = averages['Model'].tolist()
+    # Work on a copy: the column assignment below must not leak to the caller.
+    data = data.copy()
+    data['Model'] = pd.Categorical(data['Model'], categories=sorted_models, ordered=True)
+    data = data.sort_values(by=['Model', 'Task'])
+    if data.empty:
+        print("No data available for line plot.")
+        return image_avg, None, None, None, None, None
+    plt.figure(figsize=(14, 10))
+    sns.lineplot(data=data, x='Task', y='Value', hue='Model', marker='o')
+    plt.title('Task Performance by Model', fontsize=16)
+    plt.xlabel('Task', fontsize=12)
+    plt.ylabel('Performance', fontsize=12)
+    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Model')
+    plt.xticks(rotation=45)
+    plt.tight_layout()
+    image_line = _figure_to_base64()
+    # Heatmap of task performance
+    pivot_table = data.pivot_table(index='Task', columns='Model', values='Value')
+    plt.figure(figsize=(12, 10))
+    sns.heatmap(pivot_table, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
+    plt.title('Task Performance Heatmap', fontsize=16)
+    plt.xlabel('Model', fontsize=12)
+    plt.ylabel('Task', fontsize=12)
+    plt.tight_layout()
+    image_heatmap = _figure_to_base64()
+    # Boxplot of performance distribution per model
+    plt.figure(figsize=(12, 8))
+    sns.boxplot(data=data, x='Model', y='Value', palette='Set2')
+    plt.title('Performance Distribution per Model', fontsize=16)
+    plt.xlabel('Model', fontsize=12)
+    plt.ylabel('Performance', fontsize=12)
+    plt.xticks(rotation=45)
+    plt.tight_layout()
+    image_boxplot = _figure_to_base64()
+    # Create plotly bar charts
+    fig1 = create_bar_chart(averages, 'Average Performance')
+    plotly_avg = fig1.to_html(full_html=False)
+    # One Plotly chart per task, keyed by task name.
+    plotly_tasks = {}
+    for task in data['Task'].unique():
+        task_data = data[data['Task'] == task]
+        fig2 = create_bar_chart(task_data, 'Value')
+        # create_bar_chart titles the chart after the column name; replace
+        # that with the task name.
+        fig2.update_layout(title=f"Leaderboard for {task} Scores")
+        plotly_tasks[task] = fig2.to_html(full_html=False)
+    return image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks
+def process_and_visualize(file_content):
+    """Parse the pasted text and return the summary text plus all charts.
+
+    Args:
+        file_content: Benchmark text passed straight to ``parse_data``.
+
+    Returns:
+        A 7-tuple: a text summary of the per-model averages (sorted
+        ascending), followed by the six values returned by
+        ``generate_visualizations`` in the same order.
+    """
+    data = parse_data(file_content)
+    averages = calculate_averages(data)
+    image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks = generate_visualizations(data, averages)
+    output_text = f"Average Performance per Model:\n{averages.sort_values(by='Average Performance').to_string()}"
+    return output_text, image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks
 if __name__ == "__main__":
+    # Wire process_and_visualize into a single-textbox Gradio interface. The
+    # seven entries in `outputs` line up positionally with the seven values
+    # that function returns.
+    iface = gr.Interface(
+        fn=process_and_visualize,
+        inputs=gr.Textbox(lines=10, label="Paste your data here"),
+        outputs=[
+            gr.Textbox(label="Average Performance per Model"),
+            gr.Image(label="Matplotlib Average Performance Chart"),
+            gr.Image(label="Matplotlib Task Performance Line Chart"),
+            gr.Image(label="Matplotlib Task Performance Heatmap"),
+            gr.Image(label="Matplotlib Performance Distribution Boxplot"),
+            gr.HTML(label="Plotly Average Performance Chart"),
+            # NOTE(review): the function returns a dict (task -> HTML string)
+            # in this position, and the task names here are hard-coded rather
+            # than taken from the parsed data. Confirm that gr.Accordion
+            # accepts a list of components positionally and is valid as an
+            # Interface output.
+            gr.Accordion(
+                [gr.HTML(label=f"Plotly {task} Chart") for task in ['tinyArc', 'tinyHellaswag', 'tinyMMLU', 'tinyTruthfulQA', 'tinyTruthfulQA_mc1', 'tinyWinogrande']], label="Task Charts"),
+        ],
+        title="LLM Benchmark Visualizer",
+        description="Upload your LLM benchmark data and visualize the results."
+    )
+    # share=True asks Gradio to also create a public share link.
+    iface.launch(share=True)