# NOTE(review): the lines previously here were extraction artifacts (a file-size
# banner, git-blame commit hashes, and a line-number gutter) — not Python code.
import gradio as gr
import pandas as pd
import numpy as np
# Data for Table 1: robustness results (evaluation scores per model).
# The delta columns hold strings of the form "score (delta)".
# NOTE(review): the "(Ξ)" in the column headers and the leading "β" inside the
# delta strings look like mojibake (presumably "(Δ)" and a minus sign "−") —
# confirm against the original report before changing them: the "Ξ" keys are
# referenced by name elsewhere in this file (e.g. "Robustness (Ξ)").
robustness_data = {
    "Model Name": [
        "Gemini 2.0 Flash Exp", "Gemini 1.5 Pro 002", "OpenAI GPT-4o", "OpenAI o1", "OpenAI o3-mini",
        "DeepSeek-R1-Distill-Llama-8B", "DeepSeek-R1-Distill-Qwen-14B", "DeepSeek-R1-Distill-Qwen-32B",
        "DeepSeek-R1-Distill-Llama-70B", "DeepSeek-R1", "Meta-Llama-3.1-8B-Instruct",
        "Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.3-70B-Instruct", "Qwen2.5-7B-Instruct",
        "Qwen2.5-14B-Instruct", "Qwen2.5-32B-Instruct", "Qwen2.5-72B-Instruct", "Qwen2.5-7B-Instruct-1M",
        "Qwen2.5-14B-Instruct-1M", "Nemotron-70B-Instruct-HF", "Phi-3-mini-128k-Instruct",
        "Phi-3-small-128k-Instruct", "Phi-3-medium-128k-Instruct", "Palmyra-Fin-128k-Instruct"
    ],
    # Baseline accuracy: plain floats (the only robustness column format_table bolds).
    "Baseline": [0.95, 0.96, 0.95, 0.97, 0.98, 0.83, 0.95, 0.95, 0.96, 0.94, 0.91, 0.94, 0.95, 0.92, 0.95, 0.95, 0.94, 0.91, 0.95, 0.94, 0.86, 0.88, 0.89, 0.96],
    "Misspelled (Ξ)": ["0.95 (0.0)", "0.95 (0.0)", "0.94 (β0.01)", "0.95 (β0.02)", "0.96 (β0.02)", "0.85 (β0.02)", "0.90 (β0.05)", "0.97 (β0.02)", "0.97 (β0.01)", "0.94 (0.0)", "0.90 (β0.01)", "0.92 (β0.02)", "0.92 (β0.03)", "0.91 (β0.01)", "0.94 (β0.01)", "0.94 (0.0)", "0.94 (0.0)", "0.91 (0.0)", "0.92 (β0.03)", "0.94 (0.0)", "0.85 (β0.01)", "0.84 (β0.04)", "0.84 (β0.05)", "0.93 (β0.03)"],
    "Incomplete (Ξ)": ["0.95 (0.0)", "0.94 (β0.02)", "0.94 (β0.01)", "0.94 (β0.03)", "0.96 (β0.02)", "0.82 (β0.01)", "0.92 (β0.03)", "0.95 (0.0)", "0.95 (β0.01)", "0.93 (β0.01)", "0.86 (β0.05)", "0.94 (0.0)", "0.93 (β0.02)", "0.90 (β0.02)", "0.94 (β0.01)", "0.93 (β0.02)", "0.93 (β0.01)", "0.91 (0.0)", "0.91 (β0.04)", "0.93 (β0.01)", "0.78 (β0.08)", "0.78 (β0.10)", "0.84 (β0.05)", "0.92 (β0.04)"],
    "Out-of-Domain (Ξ)": ["0.88 (β0.07)", "0.92 (β0.04)", "0.92 (β0.03)", "0.89 (β0.08)", "0.95 (β0.03)", "0.87 (β0.04)", "0.93 (β0.02)", "0.92 (β0.03)", "0.94 (β0.02)", "0.91 (β0.03)", "0.82 (β0.09)", "0.87 (β0.07)", "0.90 (β0.05)", "0.85 (β0.07)", "0.94 (β0.01)", "0.92 (β0.03)", "0.92 (β0.02)", "0.86 (β0.05)", "0.91 (β0.04)", "0.90 (β0.04)", "0.79 (β0.07)", "0.83 (β0.05)", "0.81 (β0.08)", "0.90 (β0.06)"],
    "OCR Context (Ξ)": ["0.91 (β0.04)", "0.92 (β0.04)", "0.95 (0.0)", "0.94 (β0.03)", "0.90 (β0.08)", "0.72 (β0.11)", "0.86 (β0.09)", "0.89 (β0.06)", "0.93 (β0.03)", "0.88 (β0.06)", "0.80 (β0.11)", "0.88 (β0.06)", "0.89 (β0.06)", "0.80 (β0.12)", "0.88 (β0.07)", "0.92 (β0.03)", "0.91 (β0.03)", "0.77 (β0.14)", "0.89 (β0.06)", "0.91 (β0.03)", "0.69 (β0.17)", "0.78 (β0.10)", "0.72 (β0.17)", "0.89 (β0.07)"],
    "Robustness (Ξ)": ["0.83 (β0.12)", "0.84 (β0.12)", "0.85 (β0.10)", "0.81 (β0.16)", "0.90 (β0.08)", "0.64 (β0.19)", "0.82 (β0.13)", "0.86 (β0.09)", "0.89 (β0.07)", "0.80 (β0.14)", "0.70 (β0.21)", "0.80 (β0.14)", "0.82 (β0.13)", "0.75 (β0.17)", "0.86 (β0.09)", "0.85 (β0.10)", "0.84 (β0.10)", "0.74 (β0.17)", "0.80 (β0.15)", "0.82 (β0.12)", "0.58 (β0.28)", "0.70 (β0.18)", "0.63 (β0.26)", "0.83 (β0.13)"]
}
# Data for Table 2: context-grounding results (all score columns are plain
# floats; model order matches robustness_data so rows align by position).
context_grounding_data = {
    "Model Name": [
        "Gemini 2.0 Flash Exp", "Gemini 1.5 Pro 002", "OpenAI GPT-4o", "OpenAI o1", "OpenAI o3-mini",
        "DeepSeek-R1-Distill-Llama-8B", "DeepSeek-R1-Distill-Qwen-14B", "DeepSeek-R1-Distill-Qwen-32B",
        "DeepSeek-R1-Distill-Llama-70B", "DeepSeek-R1", "Meta-Llama-3.1-8B-Instruct",
        "Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.3-70B-Instruct", "Qwen2.5-7B-Instruct",
        "Qwen2.5-14B-Instruct", "Qwen2.5-32B-Instruct", "Qwen2.5-72B-Instruct", "Qwen2.5-7B-Instruct-1M",
        "Qwen2.5-14B-Instruct-1M", "Nemotron-70B-Instruct-HF", "Phi-3-mini-128k-Instruct",
        "Phi-3-small-128k-Instruct", "Phi-3-medium-128k-Instruct", "Palmyra-Fin-128k-Instruct"
    ],
    "Irrelevant Ctx": [0.81, 0.74, 0.52, 0.56, 0.67, 0.32, 0.49, 0.54, 0.50, 0.51, 0.67, 0.46, 0.50, 0.75, 0.75, 0.89, 0.69, 0.63, 0.78, 0.52, 0.54, 0.37, 0.36, 0.95],
    "No Ctx": [0.66, 0.64, 0.43, 0.55, 0.51, 0.27, 0.21, 0.24, 0.27, 0.22, 0.63, 0.37, 0.40, 0.64, 0.61, 0.68, 0.60, 0.58, 0.53, 0.48, 0.34, 0.26, 0.25, 0.66],
    "Ctx Grounding QA": [0.77, 0.72, 0.50, 0.57, 0.63, 0.30, 0.36, 0.40, 0.41, 0.39, 0.70, 0.48, 0.47, 0.75, 0.70, 0.82, 0.68, 0.65, 0.69, 0.52, 0.47, 0.34, 0.33, 0.83],
    "Ctx Grounding TG": [0.46, 0.52, 0.25, 0.45, 0.27, 0.25, 0.27, 0.35, 0.22, 0.20, 0.27, 0.37, 0.31, 0.31, 0.55, 0.55, 0.39, 0.29, 0.37, 0.39, 0.24, 0.10, 0.14, 0.65],
    "Ctx Grounding": [0.74, 0.69, 0.47, 0.55, 0.59, 0.30, 0.35, 0.39, 0.38, 0.37, 0.65, 0.47, 0.45, 0.70, 0.68, 0.79, 0.64, 0.60, 0.65, 0.50, 0.44, 0.31, 0.30, 0.80],
    # "Robustness" here duplicates the leading scores of robustness_data's
    # "Robustness (Ξ)" column (same values, as plain floats).
    "Robustness": [0.83, 0.84, 0.85, 0.81, 0.90, 0.64, 0.82, 0.86, 0.89, 0.80, 0.70, 0.80, 0.82, 0.75, 0.86, 0.85, 0.84, 0.74, 0.80, 0.82, 0.58, 0.70, 0.63, 0.83],
    "Compliance": [0.76, 0.72, 0.52, 0.59, 0.63, 0.34, 0.40, 0.44, 0.43, 0.41, 0.66, 0.51, 0.49, 0.71, 0.71, 0.80, 0.67, 0.62, 0.68, 0.54, 0.46, 0.35, 0.34, 0.81]
}
def format_table(df):
    """Return a copy of *df* with the best score in each ranked column bolded.

    For every column in the ``ranked`` set below, the per-column maximum is
    wrapped in Markdown bold markers (``**``).  Cells may be plain floats or
    "score (delta)" strings such as ``"0.95 (0.0)"``; only the leading score
    (the text before ``" ("``) is used for comparison.  Ties are all bolded.

    Parameters
    ----------
    df : pd.DataFrame
        Leaderboard table with a "Model Name" column plus score columns.

    Returns
    -------
    pd.DataFrame
        A new DataFrame; *df* itself is not modified.
    """
    # Columns eligible for highlighting.  NOTE(review): the robustness delta
    # columns (e.g. "Misspelled (Ξ)") are excluded even though the parsing
    # below could handle them — confirm whether that is intentional.
    ranked = {
        "Baseline", "Irrelevant Ctx", "No Ctx", "Ctx Grounding QA",
        "Ctx Grounding TG", "Ctx Grounding", "Robustness", "Compliance",
    }

    def _score(cell):
        # Extract the numeric score from either a float or a
        # "score (delta)" string.
        text = str(cell)
        if " (" in text:
            return float(text.split(" (")[0])
        return float(cell)

    styled_df = df.copy()
    for col in df.columns:
        if col == "Model Name" or col not in ranked:
            continue
        max_value = max(_score(x) for x in df[col])
        styled_df[col] = df[col].apply(
            lambda x: f"**{x}**" if _score(x) == max_value else x
        )
    return styled_df
def get_top_3_models(robustness_df, context_grounding_df):
    """Rank models by a combined robustness + context-grounding score.

    The combined score is the mean of (a) the average over the robustness
    score columns and (b) the average over the context-grounding score
    columns.  Rows are paired positionally, so both frames must list the
    models in the same order.

    Cell values may be plain floats, "score (delta)" strings such as
    "0.95 (β0.02)", or Markdown-bolded values produced by format_table
    (e.g. "**0.98**").

    Parameters
    ----------
    robustness_df : pd.DataFrame
        Must contain "Model Name", "Baseline" and "Robustness (Ξ)".
    context_grounding_df : pd.DataFrame
        Must contain the seven context-grounding score columns.

    Returns
    -------
    pd.DataFrame
        Columns "Rank", "Model Name", "Combined Score" (rounded to 3 places)
        for the top 3 models — fewer rows if fewer models are supplied.
    """
    numeric_cols_robustness = ["Baseline", "Robustness (Ξ)"]
    numeric_cols_context = [
        "Irrelevant Ctx", "No Ctx", "Ctx Grounding QA", "Ctx Grounding TG",
        "Ctx Grounding", "Robustness", "Compliance",
    ]

    def _to_float(cell):
        # BUG FIX: create_leaderboard passes tables already run through
        # format_table, whose "**0.98**" bold markers made the previous
        # astype(float) raise ValueError.  Strip the markers, then drop any
        # " (delta)" suffix before converting.
        text = str(cell).replace("**", "")
        if " (" in text:
            text = text.split(" (")[0]
        return float(text)

    robustness_scores = pd.DataFrame(
        {col: robustness_df[col].apply(_to_float) for col in numeric_cols_robustness}
    )
    context_scores = pd.DataFrame(
        {col: context_grounding_df[col].apply(_to_float) for col in numeric_cols_context}
    )
    # Average within each table, then average the two table-level means.
    combined_scores = (robustness_scores.mean(axis=1) + context_scores.mean(axis=1)) / 2
    combined_df = pd.DataFrame({
        "Model Name": robustness_df["Model Name"],
        "Combined Score": combined_scores,
    })
    top_3 = combined_df.sort_values(by="Combined Score", ascending=False).head(3)
    # Rank 1..k — previously hard-coded [1, 2, 3], which broke for < 3 models.
    return pd.DataFrame({
        "Rank": list(range(1, len(top_3) + 1)),
        "Model Name": top_3["Model Name"],
        "Combined Score": top_3["Combined Score"].round(3),
    })
def create_leaderboard():
    """Build the Gradio Blocks app showing both score tables and the winners.

    Returns
    -------
    gr.Blocks
        The assembled (not yet launched) Gradio application.
    """
    robustness_df = pd.DataFrame(robustness_data)
    context_grounding_df = pd.DataFrame(context_grounding_data)
    # BUG FIX: compute the winners from the RAW tables first.  format_table()
    # wraps each ranked column's best score in Markdown bold markers
    # ("**0.98**"), which the float parsing in get_top_3_models() previously
    # choked on (ValueError), crashing the app at startup.
    winners_df = get_top_3_models(robustness_df, context_grounding_df)
    # Now bold the best score per column for display only.
    robustness_df = format_table(robustness_df)
    context_grounding_df = format_table(context_grounding_df)
    with gr.Blocks(theme=gr.themes.Soft(), title="Financial Model Performance Leaderboard") as demo:
        gr.Markdown("# Financial Model Performance Leaderboard")
        with gr.Row():
            with gr.Column():
                # NOTE(review): the "**bold**" cells only render as bold if
                # this Gradio version treats DataFrame cells as Markdown
                # (datatype="markdown") — confirm and set if needed.
                with gr.Tab("Robustness Results"):
                    gr.DataFrame(
                        value=robustness_df,
                        label="Robustness Results",
                        wrap=True,
                        elem_classes=["custom-table"]
                    )
                with gr.Tab("Context Grounding Results"):
                    gr.DataFrame(
                        value=context_grounding_df,
                        label="Context Grounding Results",
                        wrap=True,
                        elem_classes=["custom-table"]
                    )
                with gr.Tab("Top 3 Winners"):
                    gr.DataFrame(
                        value=winners_df,
                        label="Top 3 Models",
                        wrap=True,
                        elem_classes=["custom-table"]
                    )
                with gr.Tab("About"):
                    gr.HTML("""
                    <div style="padding: 20px;">
                    <h2>About This Leaderboard</h2>
                    <p>This Financial Model Performance Leaderboard compares the performance of various AI models across robustness and context grounding metrics. The data is sourced from evaluations conducted on February 18, 2025, and reflects the models' ability to handle financial tasks under different conditions.</p>
                    <p>For more information, contact us at <a href="mailto:[email protected]">[email protected]</a>.</p>
                    </div>
                    """)
        with gr.Row():
            submit_btn = gr.Button("Submit Feedback")
            output = gr.Textbox(label="Feedback Submission Status", placeholder="Your feedback will appear here...")

        def submit_feedback():
            # Placeholder acknowledgement; no feedback is actually stored.
            return "Thank you for your feedback!"

        submit_btn.click(fn=submit_feedback, inputs=None, outputs=output)
    # Custom CSS for table readability (font size, padding, scroll height).
    # NOTE(review): recent Gradio versions expect css=... in the Blocks
    # constructor; assigning demo.css after construction may be ignored —
    # verify against the installed Gradio version.
    demo.css = """
    .custom-table {
        font-size: 16px; /* Increase font size for readability */
        line-height: 2; /* Increase line height for longer rows */
        max-height: 600px; /* Set maximum height for scrolling if needed */
        overflow-y: auto; /* Enable vertical scrolling if content exceeds height */
        border-collapse: collapse;
    }
    .custom-table th, .custom-table td {
        padding: 12px; /* Increase padding for spacing */
        border: 1px solid #ddd;
    }
    .custom-table th {
        background-color: #f5f5f5;
        font-weight: bold;
    }
    """
    return demo
# Entrypoint: build the leaderboard UI, then start the Gradio server.
if __name__ == "__main__":
    app = create_leaderboard()
    app.launch()