|
import gradio as gr |
|
import pandas as pd |
|
import numpy as np |
|
|
|
|
|
# Robustness results, one entry per model in every list (24 models, same order
# throughout). "Baseline" holds plain floats; the remaining columns hold
# "<score> (<delta>)" strings.
# NOTE(review): the "Ξ" and "β" glyphs in the keys and delta strings look like
# mis-encoded symbols — confirm against the original data before changing them;
# get_top_3_models() looks up "Robustness (Ξ)" verbatim.
robustness_data = {
    "Model Name": [
        "Gemini 2.0 Flash Exp",
        "Gemini 1.5 Pro 002",
        "OpenAI GPT-4o",
        "OpenAI o1",
        "OpenAI o3-mini",
        "DeepSeek-R1-Distill-Llama-8B",
        "DeepSeek-R1-Distill-Qwen-14B",
        "DeepSeek-R1-Distill-Qwen-32B",
        "DeepSeek-R1-Distill-Llama-70B",
        "DeepSeek-R1",
        "Meta-Llama-3.1-8B-Instruct",
        "Meta-Llama-3.1-70B-Instruct",
        "Meta-Llama-3.3-70B-Instruct",
        "Qwen2.5-7B-Instruct",
        "Qwen2.5-14B-Instruct",
        "Qwen2.5-32B-Instruct",
        "Qwen2.5-72B-Instruct",
        "Qwen2.5-7B-Instruct-1M",
        "Qwen2.5-14B-Instruct-1M",
        "Nemotron-70B-Instruct-HF",
        "Phi-3-mini-128k-Instruct",
        "Phi-3-small-128k-Instruct",
        "Phi-3-medium-128k-Instruct",
        "Palmyra-Fin-128k-Instruct",
    ],
    "Baseline": [
        0.95, 0.96, 0.95, 0.97, 0.98, 0.83,
        0.95, 0.95, 0.96, 0.94, 0.91, 0.94,
        0.95, 0.92, 0.95, 0.95, 0.94, 0.91,
        0.95, 0.94, 0.86, 0.88, 0.89, 0.96,
    ],
    "Misspelled (Ξ)": [
        "0.95 (0.0)", "0.95 (0.0)", "0.94 (β0.01)", "0.95 (β0.02)", "0.96 (β0.02)", "0.85 (β0.02)",
        "0.90 (β0.05)", "0.97 (β0.02)", "0.97 (β0.01)", "0.94 (0.0)", "0.90 (β0.01)", "0.92 (β0.02)",
        "0.92 (β0.03)", "0.91 (β0.01)", "0.94 (β0.01)", "0.94 (0.0)", "0.94 (0.0)", "0.91 (0.0)",
        "0.92 (β0.03)", "0.94 (0.0)", "0.85 (β0.01)", "0.84 (β0.04)", "0.84 (β0.05)", "0.93 (β0.03)",
    ],
    "Incomplete (Ξ)": [
        "0.95 (0.0)", "0.94 (β0.02)", "0.94 (β0.01)", "0.94 (β0.03)", "0.96 (β0.02)", "0.82 (β0.01)",
        "0.92 (β0.03)", "0.95 (0.0)", "0.95 (β0.01)", "0.93 (β0.01)", "0.86 (β0.05)", "0.94 (0.0)",
        "0.93 (β0.02)", "0.90 (β0.02)", "0.94 (β0.01)", "0.93 (β0.02)", "0.93 (β0.01)", "0.91 (0.0)",
        "0.91 (β0.04)", "0.93 (β0.01)", "0.78 (β0.08)", "0.78 (β0.10)", "0.84 (β0.05)", "0.92 (β0.04)",
    ],
    "Out-of-Domain (Ξ)": [
        "0.88 (β0.07)", "0.92 (β0.04)", "0.92 (β0.03)", "0.89 (β0.08)", "0.95 (β0.03)", "0.87 (β0.04)",
        "0.93 (β0.02)", "0.92 (β0.03)", "0.94 (β0.02)", "0.91 (β0.03)", "0.82 (β0.09)", "0.87 (β0.07)",
        "0.90 (β0.05)", "0.85 (β0.07)", "0.94 (β0.01)", "0.92 (β0.03)", "0.92 (β0.02)", "0.86 (β0.05)",
        "0.91 (β0.04)", "0.90 (β0.04)", "0.79 (β0.07)", "0.83 (β0.05)", "0.81 (β0.08)", "0.90 (β0.06)",
    ],
    "OCR Context (Ξ)": [
        "0.91 (β0.04)", "0.92 (β0.04)", "0.95 (0.0)", "0.94 (β0.03)", "0.90 (β0.08)", "0.72 (β0.11)",
        "0.86 (β0.09)", "0.89 (β0.06)", "0.93 (β0.03)", "0.88 (β0.06)", "0.80 (β0.11)", "0.88 (β0.06)",
        "0.89 (β0.06)", "0.80 (β0.12)", "0.88 (β0.07)", "0.92 (β0.03)", "0.91 (β0.03)", "0.77 (β0.14)",
        "0.89 (β0.06)", "0.91 (β0.03)", "0.69 (β0.17)", "0.78 (β0.10)", "0.72 (β0.17)", "0.89 (β0.07)",
    ],
    "Robustness (Ξ)": [
        "0.83 (β0.12)", "0.84 (β0.12)", "0.85 (β0.10)", "0.81 (β0.16)", "0.90 (β0.08)", "0.64 (β0.19)",
        "0.82 (β0.13)", "0.86 (β0.09)", "0.89 (β0.07)", "0.80 (β0.14)", "0.70 (β0.21)", "0.80 (β0.14)",
        "0.82 (β0.13)", "0.75 (β0.17)", "0.86 (β0.09)", "0.85 (β0.10)", "0.84 (β0.10)", "0.74 (β0.17)",
        "0.80 (β0.15)", "0.82 (β0.12)", "0.58 (β0.28)", "0.70 (β0.18)", "0.63 (β0.26)", "0.83 (β0.13)",
    ],
}
|
|
|
|
|
# Context-grounding results, one entry per model in every list (24 models, in
# the same order as the "Model Name" list). All metric columns are plain floats.
context_grounding_data = {
    "Model Name": [
        "Gemini 2.0 Flash Exp",
        "Gemini 1.5 Pro 002",
        "OpenAI GPT-4o",
        "OpenAI o1",
        "OpenAI o3-mini",
        "DeepSeek-R1-Distill-Llama-8B",
        "DeepSeek-R1-Distill-Qwen-14B",
        "DeepSeek-R1-Distill-Qwen-32B",
        "DeepSeek-R1-Distill-Llama-70B",
        "DeepSeek-R1",
        "Meta-Llama-3.1-8B-Instruct",
        "Meta-Llama-3.1-70B-Instruct",
        "Meta-Llama-3.3-70B-Instruct",
        "Qwen2.5-7B-Instruct",
        "Qwen2.5-14B-Instruct",
        "Qwen2.5-32B-Instruct",
        "Qwen2.5-72B-Instruct",
        "Qwen2.5-7B-Instruct-1M",
        "Qwen2.5-14B-Instruct-1M",
        "Nemotron-70B-Instruct-HF",
        "Phi-3-mini-128k-Instruct",
        "Phi-3-small-128k-Instruct",
        "Phi-3-medium-128k-Instruct",
        "Palmyra-Fin-128k-Instruct",
    ],
    "Irrelevant Ctx": [
        0.81, 0.74, 0.52, 0.56, 0.67, 0.32,
        0.49, 0.54, 0.50, 0.51, 0.67, 0.46,
        0.50, 0.75, 0.75, 0.89, 0.69, 0.63,
        0.78, 0.52, 0.54, 0.37, 0.36, 0.95,
    ],
    "No Ctx": [
        0.66, 0.64, 0.43, 0.55, 0.51, 0.27,
        0.21, 0.24, 0.27, 0.22, 0.63, 0.37,
        0.40, 0.64, 0.61, 0.68, 0.60, 0.58,
        0.53, 0.48, 0.34, 0.26, 0.25, 0.66,
    ],
    "Ctx Grounding QA": [
        0.77, 0.72, 0.50, 0.57, 0.63, 0.30,
        0.36, 0.40, 0.41, 0.39, 0.70, 0.48,
        0.47, 0.75, 0.70, 0.82, 0.68, 0.65,
        0.69, 0.52, 0.47, 0.34, 0.33, 0.83,
    ],
    "Ctx Grounding TG": [
        0.46, 0.52, 0.25, 0.45, 0.27, 0.25,
        0.27, 0.35, 0.22, 0.20, 0.27, 0.37,
        0.31, 0.31, 0.55, 0.55, 0.39, 0.29,
        0.37, 0.39, 0.24, 0.10, 0.14, 0.65,
    ],
    "Ctx Grounding": [
        0.74, 0.69, 0.47, 0.55, 0.59, 0.30,
        0.35, 0.39, 0.38, 0.37, 0.65, 0.47,
        0.45, 0.70, 0.68, 0.79, 0.64, 0.60,
        0.65, 0.50, 0.44, 0.31, 0.30, 0.80,
    ],
    "Robustness": [
        0.83, 0.84, 0.85, 0.81, 0.90, 0.64,
        0.82, 0.86, 0.89, 0.80, 0.70, 0.80,
        0.82, 0.75, 0.86, 0.85, 0.84, 0.74,
        0.80, 0.82, 0.58, 0.70, 0.63, 0.83,
    ],
    "Compliance": [
        0.76, 0.72, 0.52, 0.59, 0.63, 0.34,
        0.40, 0.44, 0.43, 0.41, 0.66, 0.51,
        0.49, 0.71, 0.71, 0.80, 0.67, 0.62,
        0.68, 0.54, 0.46, 0.35, 0.34, 0.81,
    ],
}
|
|
|
|
|
def format_table(df):
    """Return a copy of ``df`` with the best score of selected columns in bold.

    Only columns whose name is in the fixed whitelist below are touched. In
    each of them, every cell whose leading number equals the column maximum is
    replaced by the markdown string ``**<cell>**``; all other cells, all other
    columns and the input frame itself are left unchanged.

    Args:
        df: Table whose cells are numbers or "<score> (<delta>)" strings.

    Returns:
        A new DataFrame with the same shape and column order as ``df``.
    """
    highlighted = frozenset(
        (
            "Baseline",
            "Irrelevant Ctx",
            "No Ctx",
            "Ctx Grounding QA",
            "Ctx Grounding TG",
            "Ctx Grounding",
            "Robustness",
            "Compliance",
        )
    )

    def leading_number(cell):
        # "0.83 (...)" -> 0.83; anything else is converted directly.
        text = str(cell)
        if " (" in text:
            return float(text.split(" (")[0])
        return float(cell)

    def mark_if_best(cell, best):
        return f"**{cell}**" if leading_number(cell) == best else cell

    result = df.copy()
    for name in df.columns:
        if name == "Model Name" or name not in highlighted:
            continue

        column = df[name]
        carries_delta = any(" (" in str(cell) for cell in column)
        if carries_delta:
            scores = [float(str(cell).split(" (")[0]) for cell in column]
        else:
            scores = column.astype(float)

        best = np.max(scores)
        result[name] = column.apply(lambda cell, best=best: mark_if_best(cell, best))

    return result
|
|
|
|
|
def get_top_3_models(robustness_df, context_grounding_df):
    """Rank models by a combined robustness / context-grounding score.

    The robustness score of a model is the mean of its "Baseline" and
    "Robustness (Ξ)" cells; the context score is the mean of the seven
    context-grounding metric columns. The combined score is the average of
    those two means. Rows of the two frames are paired by index label.

    Cells may be plain numbers, "<score> (<delta>)" strings, or either of
    those wrapped in markdown bold markers ("**...**") as produced by
    format_table(); only the leading number is used.

    Args:
        robustness_df: Frame with "Model Name", "Baseline" and
            "Robustness (Ξ)" columns.
        context_grounding_df: Frame with the context-grounding metric columns.

    Returns:
        DataFrame with columns "Rank", "Model Name" and "Combined Score"
        (rounded to 3 decimals), best model first, at most three rows, with a
        fresh 0-based index.

    Raises:
        KeyError: if a required column is missing.
        ValueError: if a cell does not start with a parseable number.
    """
    numeric_cols_robustness = ["Baseline", "Robustness (Ξ)"]
    numeric_cols_context = [
        "Irrelevant Ctx",
        "No Ctx",
        "Ctx Grounding QA",
        "Ctx Grounding TG",
        "Ctx Grounding",
        "Robustness",
        "Compliance",
    ]

    def _leading_score(cell):
        # "**0.83 (β0.12)**" -> 0.83, "**0.98**" -> 0.98, 0.95 -> 0.95
        text = str(cell).strip().strip("*")
        return float(text.split(" (")[0])

    def _numeric_frame(frame, columns):
        # Convert cell by cell: a row-wise DataFrame.apply would hand the
        # parser a whole Series instead of a single value.
        return pd.DataFrame({name: frame[name].map(_leading_score) for name in columns})

    robustness_scores = _numeric_frame(robustness_df, numeric_cols_robustness)
    context_scores = _numeric_frame(context_grounding_df, numeric_cols_context)

    combined_scores = (robustness_scores.mean(axis=1) + context_scores.mean(axis=1)) / 2

    combined_df = pd.DataFrame({
        "Model Name": robustness_df["Model Name"],
        "Combined Score": combined_scores,
    })

    # A stable sort keeps the input order for tied scores; the index is reset
    # so the rank list below lines up positionally with the selected rows.
    top_3 = (
        combined_df.sort_values(by="Combined Score", ascending=False, kind="stable")
        .head(3)
        .reset_index(drop=True)
    )

    winners_df = pd.DataFrame({
        # Sized from the result so fewer than three models does not raise.
        "Rank": list(range(1, len(top_3) + 1)),
        "Model Name": top_3["Model Name"],
        "Combined Score": top_3["Combined Score"].round(3),
    })

    return winners_df
|
|
|
|
|
def create_leaderboard():
    """Build the Gradio app that shows the leaderboard tables.

    The app has four tabs: the robustness table, the context-grounding table
    (both with the best score per highlighted column in bold), the top-3
    combined ranking, and an "About" tab with a feedback button.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    raw_robustness_df = pd.DataFrame(robustness_data)
    raw_context_grounding_df = pd.DataFrame(context_grounding_data)

    # Rank on the raw numbers: format_table() rewrites the best cells as
    # "**x**" strings, which are display markup rather than scores.
    winners_df = get_top_3_models(raw_robustness_df, raw_context_grounding_df)

    robustness_df = format_table(raw_robustness_df)
    context_grounding_df = format_table(raw_context_grounding_df)

    custom_css = """
    .custom-table {
        font-size: 16px; /* Increase font size for readability */
        line-height: 2; /* Increase line height for longer rows */
        max-height: 600px; /* Set maximum height for scrolling if needed */
        overflow-y: auto; /* Enable vertical scrolling if content exceeds height */
        border-collapse: collapse;
    }
    .custom-table th, .custom-table td {
        padding: 12px; /* Increase padding for spacing */
        border: 1px solid #ddd;
    }
    .custom-table th {
        background-color: #f5f5f5;
        font-weight: bold;
    }
    """

    def _table(frame, label, render_markdown=False):
        # Shared settings for the three result tables.
        options = {
            "value": frame,
            "label": label,
            "wrap": True,
            "elem_classes": ["custom-table"],
        }
        if render_markdown:
            # Without the markdown datatype the "**x**" highlights are shown
            # as literal asterisks. Cells are stringified so every value is
            # valid markdown input.
            options["value"] = frame.astype(str)
            options["datatype"] = ["markdown"] * len(frame.columns)
        return gr.DataFrame(**options)

    # The stylesheet is passed to the constructor rather than assigned to
    # ``demo.css`` afterwards, matching how ``theme`` and ``title`` are set.
    with gr.Blocks(
        theme=gr.themes.Soft(),
        title="Financial Model Performance Leaderboard",
        css=custom_css,
    ) as demo:
        gr.Markdown("# Financial Model Performance Leaderboard")

        with gr.Row():
            with gr.Column():
                with gr.Tab("Robustness Results"):
                    _table(robustness_df, "Robustness Results", render_markdown=True)

                with gr.Tab("Context Grounding Results"):
                    _table(context_grounding_df, "Context Grounding Results", render_markdown=True)

                with gr.Tab("Top 3 Winners"):
                    _table(winners_df, "Top 3 Models")

                # NOTE(review): the feedback row is assumed to belong to the
                # "About" tab — confirm the intended layout.
                with gr.Tab("About"):
                    gr.HTML("""
                    <div style="padding: 20px;">
                    <h2>About This Leaderboard</h2>
                    <p>This Financial Model Performance Leaderboard compares the performance of various AI models across robustness and context grounding metrics. The data is sourced from evaluations conducted on February 18, 2025, and reflects the models' ability to handle financial tasks under different conditions.</p>
                    <p>For more information, contact us at <a href="mailto:[email protected]">[email protected]</a>.</p>
                    </div>
                    """)

                    with gr.Row():
                        submit_btn = gr.Button("Submit Feedback")
                        output = gr.Textbox(
                            label="Feedback Submission Status",
                            placeholder="Your feedback will appear here...",
                        )

                    def submit_feedback():
                        # No input is collected; the click only acknowledges.
                        return "Thank you for your feedback!"

                    submit_btn.click(fn=submit_feedback, inputs=None, outputs=output)

    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Build the app and start serving it only when run as a script.
    create_leaderboard().launch()