Roni Goldshmidt committed on
Commit 796891d · 1 Parent(s): 0306e3a

Initial leaderboard setup

README-Copy1.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Nexar Dashcam Leaderboard
+ emoji: 🌖
+ colorFrom: green
+ colorTo: red
+ sdk: streamlit
+ sdk_version: 1.42.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ short_description: Benchmarking driving event classification & visual insights
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
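
Note: the block above is the standard Hugging Face Spaces front matter; this Space builds with the Streamlit SDK pinned to 1.42.0 and launches app.py. A minimal sketch for sanity-checking the block before pushing (assumes PyYAML is installed; reads the README-Copy1.md committed here):

    import yaml

    # Grab the block between the first pair of '---' fences and parse it as YAML.
    with open("README-Copy1.md") as f:
        front_matter = f.read().split("---")[1]

    config = yaml.safe_load(front_matter)
    print(config["sdk"], config["sdk_version"])  # -> streamlit 1.42.0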
app.py ADDED
@@ -0,0 +1,284 @@
+ import streamlit as st
+ import pandas as pd
+ import plotly.express as px
+ import plotly.graph_objects as go
+ from comparison import ModelEvaluator, ModelComparison
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import io
+ import os
+ import base64
+
+ st.set_page_config(
+     page_title="Nexar Dashcam Leaderboard",
+     page_icon="nexar_logo.png",
+     layout="wide"
+ )
+
+ st.markdown("""
+     <style>
+     .main { padding: 2rem; }
+     .stTabs [data-baseweb="tab-list"] { gap: 8px; }
+     .stTabs [data-baseweb="tab"] {
+         padding: 8px 16px;
+         border-radius: 4px;
+     }
+     .metric-card {
+         background-color: #f8f9fa;
+         padding: 20px;
+         border-radius: 10px;
+         box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+     }
+     </style>
+ """, unsafe_allow_html=True)
+
+ col1, col2 = st.columns([0.15, 0.85])
+ with col1:
+     st.image("nexar_logo.png", width=600)
+ with col2:
+     st.title("Nexar Dashcam Leaderboard")
+
+ @st.cache_data
+ def load_data(directory='results', labels_filename='Labels.csv'):
+     labels_path = os.path.join(directory, labels_filename)
+     df_labels = pd.read_csv(labels_path)
+
+     evaluators = []
+
+     for filename in os.listdir(directory):
+         if filename.endswith('.csv') and filename != labels_filename:
+             model_name = os.path.splitext(filename)[0]
+             df_model = pd.read_csv(os.path.join(directory, filename))
+             evaluator = ModelEvaluator(df_labels, df_model, model_name)
+             evaluators.append(evaluator)
+
+     model_comparison = ModelComparison(evaluators)
+
+     return model_comparison
+
+ if 'model_comparison' not in st.session_state:
+     st.session_state.model_comparison = load_data()
+     st.session_state.leaderboard_df = st.session_state.model_comparison.transform_to_leaderboard()
+     st.session_state.combined_df = st.session_state.model_comparison.combined_df
+
+ tab1, tab2, tab3, tab4 = st.tabs([
+     "📈 Leaderboard",
+     "🎯 Category Analysis",
+     "📊 Class Performance",
+     "🔍 Detailed Metrics"
+ ])
+
+ def style_dataframe(df):
+     numeric_cols = df.select_dtypes(include=['float64']).columns
+
+     def background_gradient(s):
+         normalized = (s - s.min()) / (s.max() - s.min())
+         normalized = normalized.fillna(0)  # Handle NaN values
+         return ['background: linear-gradient(90deg, rgba(52, 152, 219, 0.2) {}%, transparent {}%)'.format(
+             int(val * 100), int(val * 100)) for val in normalized]
+
+     def highlight_max(s):
+         is_max = s == s.max()
+         return ['font-weight: bold; color: #2ecc71' if v else '' for v in is_max]
+
+     styled = df.style\
+         .format({col: '{:.2f}%' for col in numeric_cols})\
+         .apply(background_gradient, subset=numeric_cols)\
+         .apply(highlight_max, subset=numeric_cols)\
+         .set_properties(**{
+             'background-color': '#f8f9fa',
+             'padding': '10px',
+             'border': '1px solid #dee2e6',
+             'text-align': 'center'
+         })\
+         .set_table_styles([
+             {'selector': 'th', 'props': [
+                 ('background-color', '#4a90e2'),
+                 ('color', 'white'),
+                 ('font-weight', 'bold'),
+                 ('padding', '10px'),
+                 ('text-align', 'center')
+             ]},
+             {'selector': 'tr:hover', 'props': [
+                 ('background-color', '#edf2f7')
+             ]}
+         ])
+
+     return styled
+
+ with tab1:
+     st.subheader("Model Performance Leaderboard")
+
+     sort_col = st.selectbox(
+         "Sort by metric:",
+         options=[col for col in st.session_state.leaderboard_df.columns if col not in ['Rank', 'Model']],
+         key='leaderboard_sort'
+     )
+
+     sorted_df = st.session_state.leaderboard_df.sort_values(by=sort_col, ascending=False)
+     st.dataframe(
+         style_dataframe(sorted_df),
+         use_container_width=True,
+         height=400
+     )
+
+     # Category performance bar plot
+     metrics = ['F1 Score', 'Precision', 'Recall']
+     selected_metric = st.selectbox("Select Metric for Category Analysis:", metrics)
+
+     category_data = st.session_state.combined_df[
+         st.session_state.combined_df['Class'].str.contains('Overall')
+     ]
+
+     fig = px.bar(
+         category_data,
+         x='Category',
+         y=selected_metric,
+         color='Model',
+         barmode='group',
+         title=f'Category-level {selected_metric} by Model',
+     )
+
+     fig.update_layout(
+         xaxis_title="Category",
+         yaxis_title=selected_metric,
+         legend_title="Model"
+     )
+
+     st.plotly_chart(fig, use_container_width=True)
+
+ with tab2:
+     st.subheader("Category-level Analysis")
+
+     categories = st.session_state.combined_df['Category'].unique()
+     selected_category = st.selectbox("Select Category:", categories)
+
+     col1, col2 = st.columns(2)
+
+     with col1:
+         category_data = st.session_state.combined_df[
+             st.session_state.combined_df['Class'].str.contains('Overall')
+         ]
+
+         fig = px.bar(
+             category_data,
+             x='Category',
+             y=selected_metric,
+             color='Model',
+             barmode='group',
+             title=f'{selected_metric} by Category'
+         )
+         st.plotly_chart(fig, use_container_width=True)
+
+     with col2:
+         cat_data = st.session_state.combined_df[
+             (st.session_state.combined_df['Category'] == selected_category) &
+             (~st.session_state.combined_df['Class'].str.contains('Overall'))
+         ]
+
+         fig = go.Figure()
+
+         for model in cat_data['Model'].unique():
+             model_data = cat_data[cat_data['Model'] == model]
+             fig.add_trace(go.Scatterpolar(
+                 r=model_data[selected_metric],
+                 theta=model_data['Class'],
+                 name=model,
+                 fill='toself'
+             ))
+
+         fig.update_layout(
+             polar=dict(
+                 radialaxis=dict(
+                     visible=True,
+                     range=[0, 1]
+                 )
+             ),
+             showlegend=True,
+             title=f'{selected_metric} Distribution for {selected_category}'
+         )
+         st.plotly_chart(fig, use_container_width=True)
+
+ with tab3:
+     st.subheader("Class-level Performance")
+
+     col1, col2, col3 = st.columns(3)
+     with col1:
+         selected_category = st.selectbox(
+             "Select Category:",
+             categories,
+             key='class_category'
+         )
+     with col2:
+         selected_metric = st.selectbox(
+             "Select Metric:",
+             metrics,
+             key='class_metric'
+         )
+     with col3:
+         selected_models = st.multiselect(
+             "Select Models:",
+             st.session_state.combined_df['Model'].unique(),
+             default=st.session_state.combined_df['Model'].unique()
+         )
+
+     class_data = st.session_state.combined_df[
+         (st.session_state.combined_df['Category'] == selected_category) &
+         (~st.session_state.combined_df['Class'].str.contains('Overall')) &
+         (st.session_state.combined_df['Model'].isin(selected_models))
+     ]
+
+     fig = px.bar(
+         class_data,
+         x='Class',
+         y=selected_metric,
+         color='Model',
+         barmode='group',
+         title=f'{selected_metric} by Class for {selected_category}'
+     )
+     st.plotly_chart(fig, use_container_width=True)
+
+     fig = px.scatter(
+         class_data,
+         x='Precision',
+         y='Recall',
+         color='Model',
+         size='Support',
+         hover_data=['Class'],
+         title=f'Precision vs Recall for {selected_category}'
+     )
+     fig.update_traces(marker=dict(sizeref=2.*max(class_data['Support'])/40.**2))
+     st.plotly_chart(fig, use_container_width=True)
+
+ with tab4:
+     st.subheader("Detailed Metrics Analysis")
+
+     selected_model = st.selectbox(
+         "Select Model for Detailed Analysis:",
+         st.session_state.combined_df['Model'].unique()
+     )
+
+     model_data = st.session_state.combined_df[
+         st.session_state.combined_df['Model'] == selected_model
+     ]
+
+     st.markdown("### Detailed Metrics Table")
+     detailed_metrics = model_data.pivot_table(
+         index='Category',
+         columns='Class',
+         values=['F1 Score', 'Precision', 'Recall']
+     ).round(4)
+
+     st.dataframe(style_dataframe(detailed_metrics), use_container_width=True)
+
+     csv = detailed_metrics.to_csv().encode()
+     st.download_button(
+         "Download Detailed Metrics",
+         csv,
+         f"detailed_metrics_{selected_model}.csv",
+         "text/csv",
+         key='download-csv'
+     )
+
+ st.markdown("---")
+ st.markdown("Dashboard created for model evaluation and comparison.")
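
Note: load_data() above implies a simple data contract: results/ holds Labels.csv plus one prediction CSV per model (the file stem becomes the model name), and every file shares an 'id' column along with the six category columns that comparison.py scores. A minimal offline sketch of that contract, with invented toy values (no real results files needed):

    import pandas as pd
    from comparison import ModelEvaluator, ModelComparison

    # The six category columns compute_metrics() iterates over, alongside 'id'.
    categories = ['main-event', 'location', 'zone', 'light-conditions',
                  'weather-conditions', 'vehicles-density']

    # Toy ground truth and toy predictions, joined on 'id' inside merge_data().
    labels = pd.DataFrame({'id': [1, 2, 3, 4],
                           **{c: ['a', 'b', 'a', 'b'] for c in categories}})
    preds = pd.DataFrame({'id': [1, 2, 3, 4],
                          **{c: ['a', 'b', 'a', 'a'] for c in categories}})

    comparison = ModelComparison([ModelEvaluator(labels, preds, 'toy-model')])
    print(comparison.transform_to_leaderboard())  # Rank, Model, Average F1 Score, ...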
comparison.py ADDED
@@ -0,0 +1,670 @@
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score
+ import warnings
+ warnings.filterwarnings("ignore", category=UserWarning, message="y_pred contains classes not in y_true")
+ sns.set_style("whitegrid")
+
+ class ModelEvaluator:
+     def __init__(self, df_labels, df_predictions, model_name):
+         """
+         Initialize the evaluator with ground truth labels and model predictions.
+         """
+         self.df_labels = df_labels
+         self.df_predictions = df_predictions
+         self.model_name = model_name
+         self.metrics_df = self.compute_metrics()
+
+     def merge_data(self):
+         """Merge ground truth labels with predictions based on 'id'."""
+         merged_df = pd.merge(self.df_labels, self.df_predictions, on='id', suffixes=('_true', '_pred'))
+         return merged_df
+
+     def compute_metrics(self):
+         """Compute precision, recall, F1-score, accuracy, and balanced accuracy for each class and category."""
+         merged_df = self.merge_data()
+         categories = ['main-event', 'location', 'zone', 'light-conditions', 'weather-conditions', 'vehicles-density']
+
+         results = []
+
+         for category in categories:
+             y_true = merged_df[f"{category}_true"].astype(str)
+             y_pred = merged_df[f"{category}_pred"].astype(str)
+
+             labels = sorted(set(y_true) | set(y_pred))
+
+             class_precisions = precision_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
+             class_recalls = recall_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
+             class_f1 = f1_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
+
+             overall_precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
+             overall_recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
+             overall_f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
+             overall_accuracy = accuracy_score(y_true, y_pred)
+             overall_balanced_acc = balanced_accuracy_score(y_true, y_pred)
+
+             for i, label in enumerate(labels):
+                 results.append({
+                     "Model": self.model_name,
+                     "Category": category,
+                     "Class": label,
+                     "Precision": class_precisions[i],
+                     "Recall": class_recalls[i],
+                     "F1 Score": class_f1[i],
+                     "Accuracy": np.nan,
+                     "Balanced Acc.": np.nan,
+                     "Support": (y_true == label).sum()
+                 })
+
+             results.append({
+                 "Model": self.model_name,
+                 "Category": category,
+                 "Class": f"Overall ({category})",
+                 "Precision": overall_precision,
+                 "Recall": overall_recall,
+                 "F1 Score": overall_f1,
+                 "Accuracy": overall_accuracy,
+                 "Balanced Acc.": overall_balanced_acc,
+                 "Support": len(y_true)
+             })
+
+         df_res = pd.DataFrame(results)
+         return df_res.loc[df_res['Support'] > 0].reset_index(drop=True)
+
+     def get_metrics_df(self):
+         """Return the computed metrics DataFrame."""
+         return self.metrics_df
+
+
+ class ModelComparison:
+     def __init__(self, evaluators):
+         """
+         Compare multiple models based on their evaluation results.
+
+         :param evaluators: List of ModelEvaluator instances.
+         """
+         self.evaluators = evaluators
+         self.combined_df = self.aggregate_metrics()
+
+     def aggregate_metrics(self):
+         """Merge evaluation metrics from multiple models into a single DataFrame."""
+         dfs = [evaluator.get_metrics_df() for evaluator in self.evaluators]
+         return pd.concat(dfs, ignore_index=True)
+
+     def plot_category_comparison(self, metric="F1 Score"):
+         """Compare models at the category level using a grouped bar chart with consistent styling."""
+         df = self.combined_df[self.combined_df['Class'].str.contains("Overall")]
+
+         plt.figure(figsize=(12, 6))
+         colors = sns.color_palette("Set2", len(df["Model"].unique()))  # Consistent palette
+
+         ax = sns.barplot(
+             data=df, x="Category", y=metric, hue="Model", palette=colors, edgecolor="black", alpha=0.85
+         )
+
+         plt.title(f"{metric} Comparison Across Categories", fontsize=14, fontweight="bold")
+         plt.ylim(0, 1)
+         plt.xticks(rotation=45, fontsize=12)
+         plt.yticks(fontsize=12)
+         plt.xlabel("Category", fontsize=12)
+         plt.ylabel(metric, fontsize=12)
+         plt.legend(title="Model", fontsize=11, loc="upper left")
+         plt.grid(axis="y", linestyle="--", alpha=0.6)
+
+         plt.tight_layout()
+         plt.show()
+
+
+     def plot_per_class_comparison(self, category, metric="F1 Score"):
+         """Compare models for a specific category across individual classes with a standardized design."""
+         df = self.combined_df[(self.combined_df["Category"] == category) & (~self.combined_df["Class"].str.contains("Overall"))]
+
+         plt.figure(figsize=(12, 6))
+         colors = sns.color_palette("Set2", len(df["Model"].unique()))  # Consistent palette
+
+         ax = sns.barplot(
+             data=df, x="Class", y=metric, hue="Model", palette=colors, edgecolor="black", alpha=0.85
+         )
+
+         plt.title(f"{metric} for {category} by Model", fontsize=14, fontweight="bold")
+         plt.ylim(0, 1)
+         plt.xticks(rotation=45, fontsize=12)
+         plt.yticks(fontsize=12)
+         plt.xlabel("Class", fontsize=12)
+         plt.ylabel(metric, fontsize=12)
+         plt.legend(title="Model", fontsize=11, loc="upper left")
+         plt.grid(axis="y", linestyle="--", alpha=0.6)
+
+         plt.tight_layout()
+         plt.show()
+
+     def plot_precision_recall_per_class(self, class_name=None):
+         """
+         Creates a grouped bar chart per class, displaying precision & recall side by side for all models.
+         Ensures a consistent design with plot_per_class_comparison and plot_category_comparison.
+
+         :param class_name: (str) If provided, only this class will be plotted. If None, all classes will be plotted.
+         """
+         import matplotlib.pyplot as plt
+         import seaborn as sns
+         import numpy as np
+
+         sns.set_style("whitegrid")
+
+         # Determine which classes to plot
+         if class_name:
+             unique_classes = [class_name]
+         else:
+             unique_classes = self.combined_df["Class"].unique()
+
+         models = self.combined_df["Model"].unique()
+         num_models = len(models)
+
+         bar_width = 0.35  # Standardized width for better readability
+         spacing = 0  # No extra spacing to match other plots
+
+         colors = sns.color_palette("Set2", num_models)  # Consistent color palette
+
+         for class_name in unique_classes:
+             df_class = self.combined_df[self.combined_df["Class"] == class_name]
+
+             if df_class.empty:
+                 print(f"No data available for class: {class_name}")
+                 continue
+
+             plt.figure(figsize=(12, 6))
+
+             metrics = ["Precision", "Recall"]
+             x_indices = np.arange(len(metrics))  # X positions for metrics
+
+             for i, model in enumerate(models):
+                 df_model = df_class[df_class["Model"] == model]
+
+                 if df_model.empty:
+                     continue
+
+                 precision = df_model["Precision"].values[0]
+                 recall = df_model["Recall"].values[0]
+
+                 # Plot bars for Precision and Recall with consistent style
+                 plt.bar(
+                     x_indices + (i * bar_width),  # No spacing, perfectly aligned
+                     [precision, recall],
+                     width=bar_width,
+                     label=model,
+                     color=colors[i],
+                     alpha=0.85,
+                     edgecolor="black"  # Matches the other plots
+                 )
+
+             plt.xlabel("Metric", fontsize=12)
+             plt.ylabel("Score", fontsize=12)
+             plt.title(f"Precision & Recall for Class: {class_name}", fontsize=14, fontweight="bold")
+
+             # Adjust x-tick positions to align properly
+             plt.xticks(x_indices + ((bar_width * (num_models - 1)) / 2), metrics, fontsize=12)
+
+             plt.ylim(0, 1)
+             plt.legend(title="Model", fontsize=11, loc="upper left")
+             plt.grid(axis="y", linestyle="--", alpha=0.6)
+
+             plt.tight_layout()
+             plt.show()
+
+     def plot_recall_trends(self, selected_models=None):
+         """
+         Plot recall trends per class across different models, sorted by recall values in descending order.
+
+         :param selected_models: List of model names to include in the plot. If None, all models in the dataset will be used.
+         """
+         import matplotlib.pyplot as plt
+         import seaborn as sns
+         import numpy as np
+
+         sns.set_style("whitegrid")
+
+         # If no specific models are provided, use all available models in the dataset
+         if selected_models is None:
+             selected_models = self.combined_df["Model"].unique().tolist()
+
+         # Filter dataset to include only selected models
+         df_filtered = self.combined_df[self.combined_df["Model"].isin(selected_models)]
+         df_filtered_no_overall = df_filtered[~df_filtered["Class"].str.contains("Overall")]
+
+         # Sort by recall values in descending order
+         df_sorted = df_filtered_no_overall.sort_values(by="Recall", ascending=False)
+
+         plt.figure(figsize=(12, 6))
+         unique_classes = df_sorted["Class"].unique()
+
+         # Define colors for models
+         colors = dict(zip(selected_models, sns.color_palette("Set2", len(selected_models))))
+
+         # Connect corresponding classes across models with thin lines (drawn first)
+         for class_name in unique_classes:
+             class_data = df_sorted[df_sorted["Class"] == class_name]
+             if len(class_data) > 1:
+                 plt.plot(
+                     class_data["Class"], class_data["Recall"],
+                     linestyle="-", alpha=0.5, color="gray", linewidth=1.5, zorder=1
+                 )
+
+         # Plot scatter points **after** lines to ensure they are on top
+         for model in selected_models:
+             model_data = df_sorted[df_sorted["Model"] == model]
+             plt.scatter(
+                 model_data["Class"], model_data["Recall"],
+                 label=model, color=colors[model], edgecolor="black", s=120, alpha=1.0, zorder=2
+             )
+
+         plt.xlabel("Class", fontsize=12)
+         plt.ylabel("Recall", fontsize=12)
+         plt.xticks(rotation=45, ha="right", fontsize=12)
+         plt.yticks(fontsize=12)
+         plt.title("Recall per Class for Selected Models (Sorted by Recall)", fontsize=14, fontweight="bold")
+
+         # Move legend to the right
+         plt.legend(title="Model", fontsize=11, loc="upper right", bbox_to_anchor=(1.15, 1))
+
+         plt.grid(axis="y", linestyle="--", alpha=0.6)
+
+         plt.tight_layout()
+         plt.show()
+
+     def plot_metric(self, metric_name, figsize=(10, None), bar_height=0.8, palette="Set2", bar_spacing=0):
+         """
+         Creates a hierarchical visualization of metrics with category headers,
+         sorted by category-average descending. Model bars for each class are grouped side by side.
+         """
+         colors = sns.color_palette(palette, len(self.evaluators))
+         models = list(self.combined_df["Model"].unique())
+
+         df = self.combined_df.copy()
+         df = df.drop_duplicates(subset=['Category', 'Class', 'Model', metric_name])
+
+         # Calculate average support per class
+         avg_support = df.groupby(['Category', 'Class'])['Support'].mean().round().astype(int)
+
+         # Function to safely retrieve metric values
+         def safe_get_value(model, category, class_name):
+             mask = (
+                 (df['Model'] == model) &
+                 (df['Category'] == category) &
+                 (df['Class'] == class_name)
+             )
+             values = df.loc[mask, metric_name]
+             return values.iloc[0] if not values.empty else np.nan
+
+         # Calculate category averages, excluding 'Global', and sort descending
+         df_no_global = df[df['Category'] != 'Global']
+         cat_avgs = df_no_global.groupby('Category', observed=False)[metric_name].mean()
+         cat_avgs = cat_avgs.sort_values(ascending=False)
+         categories_ordered = list(cat_avgs.index)
+
+         if 'Global' in df['Category'].unique():
+             categories_ordered.append('Global')
+
+         plot_data = []
+         yticks = []
+         ylabels = []
+         y_pos = 0
+         category_positions = {}
+
+         # Process each category and its classes
+         for category in categories_ordered:
+             if category == 'Global':
+                 continue
+
+             category_data = df[df['Category'] == category]
+             overall_class_name = f"Overall ({category})"
+             mask_overall = category_data['Class'] == overall_class_name
+             category_data_overall = category_data[mask_overall]
+             category_data_regular = category_data[~mask_overall]
+
+             if not category_data_regular.empty:
+                 class_means = category_data_regular.groupby('Class')[metric_name].mean()
+                 class_means = class_means.sort_values(ascending=False)
+                 sorted_regular_classes = list(class_means.index)
+             else:
+                 sorted_regular_classes = []
+
+             # Add category header
+             category_start = y_pos
+             yticks.append(y_pos)
+             ylabels.append(category.upper())
+             y_pos += 1
+
+             # Add regular classes
+             for class_name in sorted_regular_classes:
+                 values = {model: safe_get_value(model, category, class_name) for model in models}
+                 if any(not np.isnan(v) for v in values.values()):
+                     plot_data.append({
+                         'category': category,
+                         'label': class_name,
+                         'y_pos': y_pos,
+                         'values': values,
+                         'is_category': False
+                     })
+                     support = avg_support.get((category, class_name), 0)
+                     yticks.append(y_pos)
+                     ylabels.append(f"    {class_name} (n={support:,})")
+                     y_pos += 1
+
+             # Add overall class if exists
+             if not category_data_overall.empty:
+                 values = {model: safe_get_value(model, category, overall_class_name) for model in models}
+                 if any(not np.isnan(v) for v in values.values()):
+                     plot_data.append({
+                         'category': category,
+                         'label': overall_class_name,
+                         'y_pos': y_pos,
+                         'values': values,
+                         'is_category': False
+                     })
+                     support = avg_support.get((category, overall_class_name), 0)
+                     yticks.append(y_pos)
+                     ylabels.append(f"    {overall_class_name} (n={support:,})")
+                     y_pos += 1
+
+             category_positions[category] = {
+                 'start': category_start,
+                 'end': y_pos - 1
+             }
+
+             y_pos += 0.5  # Spacing between categories
+
+         # Calculate dynamic figure height based on number of items
+         total_items = len(plot_data) + len(categories_ordered)
+         dynamic_height = max(6, total_items * 0.4)
+         if figsize[1] is None:
+             figsize = (figsize[0], dynamic_height)
+
+         # Plot the bars
+         bar_width = bar_height / len(models)  # No extra spacing
+
+         fig, ax = plt.subplots(figsize=figsize)
+
+         for category in categories_ordered:
+             if category == 'Global':
+                 continue
+             cat_start = category_positions[category]['start'] - 0.4
+             cat_end = category_positions[category]['end'] + 0.4
+             ax.axhspan(cat_start, cat_end, color='lightgray', alpha=0.2, zorder=0)
+
+         for i, (model, color) in enumerate(zip(models, colors)):
+             positions = []
+             values = []
+             for item in plot_data:
+                 if not item.get('is_category', False):
+                     positions.append(item['y_pos'] + (i - len(models)/2) * bar_width)
+                     values.append(item['values'].get(model, np.nan))
+
+             ax.barh(
+                 positions, values, height=bar_width,
+                 label=model, color=color, alpha=0.85, edgecolor="black"
+             )
+
+         # Main title
+         ax.set_title(f'{metric_name} Comparison Across Models', fontsize=16, fontweight='bold', pad=20)
+
+         # Adjust axis labels and formatting
+         ax.set_yticks(yticks)
+         ax.set_yticklabels(ylabels, fontsize=10)
+         ax.set_xlabel(metric_name, fontsize=12)
+         ax.grid(True, axis='x', linestyle="--", alpha=0.7)
+
+         # Invert y-axis to align properly
+         ax.invert_yaxis()
+         plt.legend(title="Model", bbox_to_anchor=(1.05, 1), loc='upper left')
+
+         # Adjust layout with tighter margins
+         plt.subplots_adjust(left=0.25, right=0.8, top=0.95, bottom=0.1)
+         plt.tight_layout()
+
+         return fig
+
+     def plot_precision_recall_for_category(self, category, palette="Set2"):
+         """
+         Creates a modernized Precision-Recall scatter plot for each class within a given category.
+         """
+         import matplotlib.pyplot as plt
+         import seaborn as sns
+         import math
+         import numpy as np
+
+         # Set modern style
+         plt.rcParams['font.size'] = 12
+
+         # Filter data for the selected category
+         df = self.combined_df[self.combined_df["Category"] == category].copy()
+         if df.empty:
+             print(f"No data available for category: {category}")
+             return None
+
+         # Remove overall category-level rows
+         class_data = df[~df["Class"].str.contains("Overall")]
+
+         # Get unique models and classes
+         models = df["Model"].unique()
+         colors = dict(zip(models, sns.color_palette(palette, len(models))))
+         classes = sorted(class_data["Class"].unique())
+
+         # Determine grid size
+         cols = 2
+         rows = math.ceil(len(classes) / cols)
+
+         # Create figure with adjusted size
+         fig, axes = plt.subplots(rows, cols, figsize=(16, rows * 6))
+
+         # Set global title with better spacing
+         fig.suptitle(f'Precision-Recall Analysis for {category}',
+                      fontsize=20, fontweight='bold', y=1.02)
+
+         # Iterate over classes and create scatter plots
+         for i, class_name in enumerate(classes):
+             row, col = divmod(i, cols)
+             ax = axes[row, col] if rows > 1 else axes[col]  # Ensure indexing works for 1-row cases
+
+             # Create scatter plot
+             class_subset = class_data[class_data["Class"] == class_name]
+             sns.scatterplot(
+                 data=class_subset,
+                 x="Precision",
+                 y="Recall",
+                 hue="Model",
+                 palette=colors,
+                 ax=ax,
+                 s=200,
+                 alpha=0.85,
+                 edgecolor="black"
+             )
+
+             # Add labels with lines for each point
+             for idx, row in class_subset.iterrows():
+                 ax.annotate(
+                     row["Model"],
+                     (row["Precision"], row["Recall"]),
+                     xytext=(8, 8), textcoords='offset points',  # Adjusted to reduce overlap
+                     bbox=dict(facecolor='white', alpha=0.7),
+                     arrowprops=dict(
+                         arrowstyle='->',
+                         connectionstyle='arc3,rad=0.2',
+                         color='black'
+                     )
+                 )
+
+             ax.set_title(f"Class: {class_name}", fontsize=16, fontweight="bold", pad=20)
+             ax.set_xlim(-0.05, 1.05)
+             ax.set_ylim(-0.05, 1.05)
+             ax.grid(True, linestyle="--", alpha=0.5)
+             ax.set_aspect("equal", adjustable="box")
+
+             # Remove legend as we now have direct labels
+             ax.get_legend().remove()
+
+             # Add labels
+             ax.set_xlabel("Precision", fontsize=14)
+             ax.set_ylabel("Recall", fontsize=14)
+
+         # Remove empty subplots if classes < grid size
+         for j in range(i + 1, rows * cols):
+             fig.delaxes(axes.flatten()[j])
+
+         # Adjust layout with better spacing
+         fig.subplots_adjust(top=0.92, bottom=0.08, left=0.08, right=0.92, hspace=0.35, wspace=0.3)
+
+         return fig
+
+     def plot_normalized_radar_chart(self, metric_name="F1 Score", exclude_categories=None, figsize=(12, 10), palette="Set2"):
+         """
+         Create a normalized radar chart comparing performance across different categories.
+         Each vertex is normalized independently based on its maximum value.
+         """
+         import numpy as np
+         import matplotlib.pyplot as plt
+         import seaborn as sns
+         from matplotlib.patches import Circle
+
+         sns.set_style("whitegrid")
+
+         # Copy data and filter exclusions
+         df = self.combined_df.copy()
+         if exclude_categories:
+             df = df[~df["Category"].isin(exclude_categories)]
+
+         # Get unique categories and models
+         categories = sorted(df["Category"].unique())
+         models = sorted(df["Model"].unique())
+
+         # Define colors for models
+         colors = dict(zip(models, sns.color_palette(palette, len(models))))
+
+         # Create figure
+         fig = plt.figure(figsize=figsize)
+         ax = plt.subplot(111, polar=True)
+
+         # Add subtle background circles
+         for radius in np.linspace(0, 1, 5):
+             circle = Circle((0, 0), radius, transform=ax.transData._b,
+                             fill=True, color='gray', alpha=0.03)
+             ax.add_artist(circle)
+
+         # Number of categories and angles
+         N = len(categories)
+         angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
+         angles += angles[:1]  # Close the circle
+
+         # Get max values for each category
+         df_overall = df[df["Class"].str.contains("Overall")]
+         max_values = df_overall.groupby("Category")[metric_name].max().to_dict()
+
+         # Store normalized values for all models
+         normalized_values = {}
+
+         # Normalize values for each model
+         for model in models:
+             values = []
+             for cat in categories:
+                 val = df_overall[(df_overall["Model"] == model) &
+                                  (df_overall["Category"] == cat)][metric_name].values
+                 val = val[0] if len(val) > 0 else 0
+                 norm_val = val / max_values[cat] if max_values[cat] > 0 else 0
+                 values.append(norm_val)
+             normalized_values[model] = values + [values[0]]
+
+         # Plot each model with improved styling
+         for model, values in normalized_values.items():
+             color = colors[model]
+
+             # Add filled area with gradient
+             ax.fill(angles, values, color=color, alpha=0.15,
+                     edgecolor=color, linewidth=0.5)
+
+             # Add main line
+             ax.plot(angles, values,
+                     linewidth=2.5, linestyle='solid',
+                     label=model, color=color, alpha=0.85,
+                     zorder=5)
+
+         # Adjust axis settings
+         ax.set_theta_offset(np.pi / 2)
+         ax.set_theta_direction(-1)
+         ax.set_yticklabels([])
+
+         # Draw category labels
+         ax.set_thetagrids(np.degrees(angles[:-1]), categories,
+                           fontsize=12, fontweight="bold")
+
+         # Add scale labels with improved positioning
+         for i, (category, angle) in enumerate(zip(categories, angles[:-1])):
+             max_val = max_values[category]
+             scales = np.linspace(0, max_val, 5)
+
+             for j, scale in enumerate(scales):
+                 radius = j/4
+
+                 # Skip zero as we'll add it separately in the center
+                 if radius > 0:
+                     ha = 'center'
+                     va = 'center'
+
+                     ax.text(angle, radius, f'{scale:.2f}',
+                             ha=ha, va=va,
+                             color='gray', fontsize=9, fontweight='bold')
+
+         # Add centered zero
+         ax.text(0, 0, '0.00',
+                 ha='center', va='center',
+                 color='gray', fontsize=9, fontweight='bold')
+
+         # Customize grid with softer lines
+         ax.grid(True, color='gray', alpha=0.3, linewidth=0.5)
+         ax.yaxis.grid(True, color='gray', alpha=0.3, linewidth=0.5)
+         ax.set_rticks(np.linspace(0, 1, 5))
+
+         # Add a subtle background color to the entire plot
+         ax.set_facecolor('#f8f9fa')
+
+         # Set title and legend
+         plt.title(f'Model Performance Radar Chart - {metric_name}',
+                   pad=20, fontsize=14, fontweight="bold")
+         plt.legend(title="Model", fontsize=11,
+                    loc="upper right", bbox_to_anchor=(1.3, 1))
+
+         # Adjust aspect ratio
+         ax.set_aspect('equal')
+
+         plt.tight_layout()
+         return fig
+
+     def transform_to_leaderboard(self):
+         # Remove category-level 'Overall' rows, keeping only class-level rows
+         df = self.combined_df.copy()
+         df = df[~df['Class'].str.contains('Overall', na=False)]
+
+         # Pivot the table so that each category gets its own set of columns
+         pivoted_df = df.pivot_table(
+             index='Model',
+             columns='Category',
+             values=['Precision', 'Recall', 'F1 Score', 'Accuracy'],
+             aggfunc='mean'  # Take mean in case of multiple entries
+         )
+
+         # Flatten the multi-level columns
+         pivoted_df.columns = ['_'.join(col).strip() for col in pivoted_df.columns.values]
+
+         # Calculate the average F1 score across all categories for ranking
+         pivoted_df['Average F1 Score'] = pivoted_df.filter(like='F1 Score').mean(axis=1)
+
+         # Move 'Average F1 Score' to be the first column after 'Model'
+         pivoted_df = pivoted_df.reset_index()
+         cols = ['Model', 'Average F1 Score'] + [col for col in pivoted_df.columns if col not in ['Model', 'Average F1 Score']]
+         pivoted_df = pivoted_df[cols]
+
+         # Rank models based on their average F1 Score
+         pivoted_df = pivoted_df.sort_values(by='Average F1 Score', ascending=False).reset_index(drop=True)
+         pivoted_df.insert(0, 'Rank', range(1, len(pivoted_df) + 1))
+
+         return pivoted_df
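
Note: the plotting helpers above are written for notebook use: most call plt.show(), while plot_metric, plot_precision_recall_for_category, and plot_normalized_radar_chart return a matplotlib Figure. A self-contained sketch (toy data, invented values) exercising them outside Streamlit:

    import pandas as pd
    from comparison import ModelEvaluator, ModelComparison

    categories = ['main-event', 'location', 'zone', 'light-conditions',
                  'weather-conditions', 'vehicles-density']
    labels = pd.DataFrame({'id': range(4), **{c: ['a', 'b', 'a', 'b'] for c in categories}})
    model_a = pd.DataFrame({'id': range(4), **{c: ['a', 'b', 'a', 'a'] for c in categories}})
    model_b = pd.DataFrame({'id': range(4), **{c: ['b', 'b', 'a', 'b'] for c in categories}})

    mc = ModelComparison([
        ModelEvaluator(labels, model_a, 'model-a'),
        ModelEvaluator(labels, model_b, 'model-b'),
    ])

    mc.plot_category_comparison(metric="F1 Score")               # grouped bars per category
    mc.plot_per_class_comparison("main-event", metric="Recall")  # per-class bars for one category
    fig = mc.plot_metric("F1 Score")                             # hierarchical horizontal bars
    fig.savefig("f1_by_class.png", dpi=150)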
nexar_logo.png ADDED
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ streamlit
+ pandas
+ plotly
+ seaborn
+ scikit-learn
+ matplotlib
results/GPT-4o.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/Gemini-2.0-flash-lite.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/Gemini-2.0-flash.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/Gemini-2.0-pro.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/Labels.csv ADDED
The diff for this file is too large to render. See raw diff