# NOTE(review): the lines previously here were extraction artifacts (a file-size
# banner, git-blame commit hashes, and a line-number gutter) — not Python code.
import gradio as gr
import pandas as pd
import numpy as np
# Data for Table 1: robustness results (evaluation scores per model).
# The delta columns hold strings of the form "score (delta)".
# NOTE(review): the "(Ξ)" in the column headers and the leading "β" inside the
# delta strings look like mojibake (presumably "(Δ)" and a minus sign "−") —
# confirm against the original report before changing them: the "Ξ" keys are
# referenced by name elsewhere in this file (e.g. "Robustness (Ξ)").
robustness_data = {
    "Model Name": [
        "Gemini 2.0 Flash Exp", "Gemini 1.5 Pro 002", "OpenAI GPT-4o", "OpenAI o1", "OpenAI o3-mini",
        "DeepSeek-R1-Distill-Llama-8B", "DeepSeek-R1-Distill-Qwen-14B", "DeepSeek-R1-Distill-Qwen-32B",
        "DeepSeek-R1-Distill-Llama-70B", "DeepSeek-R1", "Meta-Llama-3.1-8B-Instruct",
        "Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.3-70B-Instruct", "Qwen2.5-7B-Instruct",
        "Qwen2.5-14B-Instruct", "Qwen2.5-32B-Instruct", "Qwen2.5-72B-Instruct", "Qwen2.5-7B-Instruct-1M",
        "Qwen2.5-14B-Instruct-1M", "Nemotron-70B-Instruct-HF", "Phi-3-mini-128k-Instruct",
        "Phi-3-small-128k-Instruct", "Phi-3-medium-128k-Instruct", "Palmyra-Fin-128k-Instruct"
    ],
    # Baseline accuracy: plain floats (the only robustness column format_table bolds).
    "Baseline": [0.95, 0.96, 0.95, 0.97, 0.98, 0.83, 0.95, 0.95, 0.96, 0.94, 0.91, 0.94, 0.95, 0.92, 0.95, 0.95, 0.94, 0.91, 0.95, 0.94, 0.86, 0.88, 0.89, 0.96],
    "Misspelled (Ξ)": ["0.95 (0.0)", "0.95 (0.0)", "0.94 (β0.01)", "0.95 (β0.02)", "0.96 (β0.02)", "0.85 (β0.02)", "0.90 (β0.05)", "0.97 (β0.02)", "0.97 (β0.01)", "0.94 (0.0)", "0.90 (β0.01)", "0.92 (β0.02)", "0.92 (β0.03)", "0.91 (β0.01)", "0.94 (β0.01)", "0.94 (0.0)", "0.94 (0.0)", "0.91 (0.0)", "0.92 (β0.03)", "0.94 (0.0)", "0.85 (β0.01)", "0.84 (β0.04)", "0.84 (β0.05)", "0.93 (β0.03)"],
    "Incomplete (Ξ)": ["0.95 (0.0)", "0.94 (β0.02)", "0.94 (β0.01)", "0.94 (β0.03)", "0.96 (β0.02)", "0.82 (β0.01)", "0.92 (β0.03)", "0.95 (0.0)", "0.95 (β0.01)", "0.93 (β0.01)", "0.86 (β0.05)", "0.94 (0.0)", "0.93 (β0.02)", "0.90 (β0.02)", "0.94 (β0.01)", "0.93 (β0.02)", "0.93 (β0.01)", "0.91 (0.0)", "0.91 (β0.04)", "0.93 (β0.01)", "0.78 (β0.08)", "0.78 (β0.10)", "0.84 (β0.05)", "0.92 (β0.04)"],
    "Out-of-Domain (Ξ)": ["0.88 (β0.07)", "0.92 (β0.04)", "0.92 (β0.03)", "0.89 (β0.08)", "0.95 (β0.03)", "0.87 (β0.04)", "0.93 (β0.02)", "0.92 (β0.03)", "0.94 (β0.02)", "0.91 (β0.03)", "0.82 (β0.09)", "0.87 (β0.07)", "0.90 (β0.05)", "0.85 (β0.07)", "0.94 (β0.01)", "0.92 (β0.03)", "0.92 (β0.02)", "0.86 (β0.05)", "0.91 (β0.04)", "0.90 (β0.04)", "0.79 (β0.07)", "0.83 (β0.05)", "0.81 (β0.08)", "0.90 (β0.06)"],
    "OCR Context (Ξ)": ["0.91 (β0.04)", "0.92 (β0.04)", "0.95 (0.0)", "0.94 (β0.03)", "0.90 (β0.08)", "0.72 (β0.11)", "0.86 (β0.09)", "0.89 (β0.06)", "0.93 (β0.03)", "0.88 (β0.06)", "0.80 (β0.11)", "0.88 (β0.06)", "0.89 (β0.06)", "0.80 (β0.12)", "0.88 (β0.07)", "0.92 (β0.03)", "0.91 (β0.03)", "0.77 (β0.14)", "0.89 (β0.06)", "0.91 (β0.03)", "0.69 (β0.17)", "0.78 (β0.10)", "0.72 (β0.17)", "0.89 (β0.07)"],
    "Robustness (Ξ)": ["0.83 (β0.12)", "0.84 (β0.12)", "0.85 (β0.10)", "0.81 (β0.16)", "0.90 (β0.08)", "0.64 (β0.19)", "0.82 (β0.13)", "0.86 (β0.09)", "0.89 (β0.07)", "0.80 (β0.14)", "0.70 (β0.21)", "0.80 (β0.14)", "0.82 (β0.13)", "0.75 (β0.17)", "0.86 (β0.09)", "0.85 (β0.10)", "0.84 (β0.10)", "0.74 (β0.17)", "0.80 (β0.15)", "0.82 (β0.12)", "0.58 (β0.28)", "0.70 (β0.18)", "0.63 (β0.26)", "0.83 (β0.13)"]
}
# Data for Table 2: context-grounding results (all score columns are plain
# floats; model order matches robustness_data so rows align by position).
context_grounding_data = {
    "Model Name": [
        "Gemini 2.0 Flash Exp", "Gemini 1.5 Pro 002", "OpenAI GPT-4o", "OpenAI o1", "OpenAI o3-mini",
        "DeepSeek-R1-Distill-Llama-8B", "DeepSeek-R1-Distill-Qwen-14B", "DeepSeek-R1-Distill-Qwen-32B",
        "DeepSeek-R1-Distill-Llama-70B", "DeepSeek-R1", "Meta-Llama-3.1-8B-Instruct",
        "Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.3-70B-Instruct", "Qwen2.5-7B-Instruct",
        "Qwen2.5-14B-Instruct", "Qwen2.5-32B-Instruct", "Qwen2.5-72B-Instruct", "Qwen2.5-7B-Instruct-1M",
        "Qwen2.5-14B-Instruct-1M", "Nemotron-70B-Instruct-HF", "Phi-3-mini-128k-Instruct",
        "Phi-3-small-128k-Instruct", "Phi-3-medium-128k-Instruct", "Palmyra-Fin-128k-Instruct"
    ],
    "Irrelevant Ctx": [0.81, 0.74, 0.52, 0.56, 0.67, 0.32, 0.49, 0.54, 0.50, 0.51, 0.67, 0.46, 0.50, 0.75, 0.75, 0.89, 0.69, 0.63, 0.78, 0.52, 0.54, 0.37, 0.36, 0.95],
    "No Ctx": [0.66, 0.64, 0.43, 0.55, 0.51, 0.27, 0.21, 0.24, 0.27, 0.22, 0.63, 0.37, 0.40, 0.64, 0.61, 0.68, 0.60, 0.58, 0.53, 0.48, 0.34, 0.26, 0.25, 0.66],
    "Ctx Grounding QA": [0.77, 0.72, 0.50, 0.57, 0.63, 0.30, 0.36, 0.40, 0.41, 0.39, 0.70, 0.48, 0.47, 0.75, 0.70, 0.82, 0.68, 0.65, 0.69, 0.52, 0.47, 0.34, 0.33, 0.83],
    "Ctx Grounding TG": [0.46, 0.52, 0.25, 0.45, 0.27, 0.25, 0.27, 0.35, 0.22, 0.20, 0.27, 0.37, 0.31, 0.31, 0.55, 0.55, 0.39, 0.29, 0.37, 0.39, 0.24, 0.10, 0.14, 0.65],
    "Ctx Grounding": [0.74, 0.69, 0.47, 0.55, 0.59, 0.30, 0.35, 0.39, 0.38, 0.37, 0.65, 0.47, 0.45, 0.70, 0.68, 0.79, 0.64, 0.60, 0.65, 0.50, 0.44, 0.31, 0.30, 0.80],
    # "Robustness" here duplicates the leading scores of robustness_data's
    # "Robustness (Ξ)" column (same values, as plain floats).
    "Robustness": [0.83, 0.84, 0.85, 0.81, 0.90, 0.64, 0.82, 0.86, 0.89, 0.80, 0.70, 0.80, 0.82, 0.75, 0.86, 0.85, 0.84, 0.74, 0.80, 0.82, 0.58, 0.70, 0.63, 0.83],
    "Compliance": [0.76, 0.72, 0.52, 0.59, 0.63, 0.34, 0.40, 0.44, 0.43, 0.41, 0.66, 0.51, 0.49, 0.71, 0.71, 0.80, 0.67, 0.62, 0.68, 0.54, 0.46, 0.35, 0.34, 0.81]
}
def format_table(df):
    """Return a copy of *df* with the best score in each ranked column bolded.

    For every column in the ``ranked`` set below, the per-column maximum is
    wrapped in Markdown bold markers (``**``).  Cells may be plain floats or
    "score (delta)" strings such as ``"0.95 (0.0)"``; only the leading score
    (the text before ``" ("``) is used for comparison.  Ties are all bolded.

    Parameters
    ----------
    df : pd.DataFrame
        Leaderboard table with a "Model Name" column plus score columns.

    Returns
    -------
    pd.DataFrame
        A new DataFrame; *df* itself is not modified.
    """
    # Columns eligible for highlighting.  NOTE(review): the robustness delta
    # columns (e.g. "Misspelled (Ξ)") are excluded even though the parsing
    # below could handle them — confirm whether that is intentional.
    ranked = {
        "Baseline", "Irrelevant Ctx", "No Ctx", "Ctx Grounding QA",
        "Ctx Grounding TG", "Ctx Grounding", "Robustness", "Compliance",
    }

    def _score(cell):
        # Extract the numeric score from either a float or a
        # "score (delta)" string.
        text = str(cell)
        if " (" in text:
            return float(text.split(" (")[0])
        return float(cell)

    styled_df = df.copy()
    for col in df.columns:
        if col == "Model Name" or col not in ranked:
            continue
        max_value = max(_score(x) for x in df[col])
        styled_df[col] = df[col].apply(
            lambda x: f"**{x}**" if _score(x) == max_value else x
        )
    return styled_df
def get_top_3_models(robustness_df, context_grounding_df):
    """Rank models by a combined robustness + context-grounding score.

    The combined score is the mean of (a) the average over the robustness
    score columns and (b) the average over the context-grounding score
    columns.  Rows are paired positionally, so both frames must list the
    models in the same order.

    Cell values may be plain floats, "score (delta)" strings such as
    "0.95 (β0.02)", or Markdown-bolded values produced by format_table
    (e.g. "**0.98**").

    Parameters
    ----------
    robustness_df : pd.DataFrame
        Must contain "Model Name", "Baseline" and "Robustness (Ξ)".
    context_grounding_df : pd.DataFrame
        Must contain the seven context-grounding score columns.

    Returns
    -------
    pd.DataFrame
        Columns "Rank", "Model Name", "Combined Score" (rounded to 3 places)
        for the top 3 models — fewer rows if fewer models are supplied.
    """
    numeric_cols_robustness = ["Baseline", "Robustness (Ξ)"]
    numeric_cols_context = [
        "Irrelevant Ctx", "No Ctx", "Ctx Grounding QA", "Ctx Grounding TG",
        "Ctx Grounding", "Robustness", "Compliance",
    ]

    def _to_float(cell):
        # BUG FIX: create_leaderboard passes tables already run through
        # format_table, whose "**0.98**" bold markers made the previous
        # astype(float) raise ValueError.  Strip the markers, then drop any
        # " (delta)" suffix before converting.
        text = str(cell).replace("**", "")
        if " (" in text:
            text = text.split(" (")[0]
        return float(text)

    robustness_scores = pd.DataFrame(
        {col: robustness_df[col].apply(_to_float) for col in numeric_cols_robustness}
    )
    context_scores = pd.DataFrame(
        {col: context_grounding_df[col].apply(_to_float) for col in numeric_cols_context}
    )
    # Average within each table, then average the two table-level means.
    combined_scores = (robustness_scores.mean(axis=1) + context_scores.mean(axis=1)) / 2
    combined_df = pd.DataFrame({
        "Model Name": robustness_df["Model Name"],
        "Combined Score": combined_scores,
    })
    top_3 = combined_df.sort_values(by="Combined Score", ascending=False).head(3)
    # Rank 1..k — previously hard-coded [1, 2, 3], which broke for < 3 models.
    return pd.DataFrame({
        "Rank": list(range(1, len(top_3) + 1)),
        "Model Name": top_3["Model Name"],
        "Combined Score": top_3["Combined Score"].round(3),
    })
def create_leaderboard():
    """Build the Gradio Blocks app showing both score tables and the winners.

    Returns
    -------
    gr.Blocks
        The assembled (not yet launched) Gradio application.
    """
    robustness_df = pd.DataFrame(robustness_data)
    context_grounding_df = pd.DataFrame(context_grounding_data)
    # BUG FIX: compute the winners from the RAW tables first.  format_table()
    # wraps each ranked column's best score in Markdown bold markers
    # ("**0.98**"), which the float parsing in get_top_3_models() previously
    # choked on (ValueError), crashing the app at startup.
    winners_df = get_top_3_models(robustness_df, context_grounding_df)
    # Now bold the best score per column for display only.
    robustness_df = format_table(robustness_df)
    context_grounding_df = format_table(context_grounding_df)
    with gr.Blocks(theme=gr.themes.Soft(), title="Financial Model Performance Leaderboard") as demo:
        gr.Markdown("# Financial Model Performance Leaderboard")
        with gr.Row():
            with gr.Column():
                # NOTE(review): the "**bold**" cells only render as bold if
                # this Gradio version treats DataFrame cells as Markdown
                # (datatype="markdown") — confirm and set if needed.
                with gr.Tab("Robustness Results"):
                    gr.DataFrame(
                        value=robustness_df,
                        label="Robustness Results",
                        wrap=True,
                        elem_classes=["custom-table"]
                    )
                with gr.Tab("Context Grounding Results"):
                    gr.DataFrame(
                        value=context_grounding_df,
                        label="Context Grounding Results",
                        wrap=True,
                        elem_classes=["custom-table"]
                    )
                with gr.Tab("Top 3 Winners"):
                    gr.DataFrame(
                        value=winners_df,
                        label="Top 3 Models",
                        wrap=True,
                        elem_classes=["custom-table"]
                    )
                with gr.Tab("About"):
                    gr.HTML("""
                    <div style="padding: 20px;">
                    <h2>About This Leaderboard</h2>
                    <p>This Financial Model Performance Leaderboard compares the performance of various AI models across robustness and context grounding metrics. The data is sourced from evaluations conducted on February 18, 2025, and reflects the models' ability to handle financial tasks under different conditions.</p>
                    <p>For more information, contact us at <a href="mailto:[email protected]">[email protected]</a>.</p>
                    </div>
                    """)
        with gr.Row():
            submit_btn = gr.Button("Submit Feedback")
            output = gr.Textbox(label="Feedback Submission Status", placeholder="Your feedback will appear here...")

        def submit_feedback():
            # Placeholder acknowledgement; no feedback is actually stored.
            return "Thank you for your feedback!"

        submit_btn.click(fn=submit_feedback, inputs=None, outputs=output)
    # Custom CSS for table readability (font size, padding, scroll height).
    # NOTE(review): recent Gradio versions expect css=... in the Blocks
    # constructor; assigning demo.css after construction may be ignored —
    # verify against the installed Gradio version.
    demo.css = """
    .custom-table {
        font-size: 16px; /* Increase font size for readability */
        line-height: 2; /* Increase line height for longer rows */
        max-height: 600px; /* Set maximum height for scrolling if needed */
        overflow-y: auto; /* Enable vertical scrolling if content exceeds height */
        border-collapse: collapse;
    }
    .custom-table th, .custom-table td {
        padding: 12px; /* Increase padding for spacing */
        border: 1px solid #ddd;
    }
    .custom-table th {
        background-color: #f5f5f5;
        font-weight: bold;
    }
    """
    return demo
# Entrypoint: build the leaderboard UI, then start the Gradio server.
if __name__ == "__main__":
    app = create_leaderboard()
    app.launch()