import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import os
import base64

# Benchmark categories. Each category carries an equal 25% weight in the
# overall score; the "metrics" lists are descriptive labels for what each
# category covers (the measured values live in the per-model score dicts below).
CATEGORIES = {
    "Document Understanding": {
        "metrics": [
            "Invoice ID Detection",
            "Date Field Recognition",
            "Address Block Parsing",
            "Table Structure Recognition"
        ],
        "weight": 0.25
    },
    "Data Extraction": {
        "metrics": [
            "Line Item Extraction",
            "Numerical Value Accuracy",
            "Text Field Accuracy",
            "Field Completeness"
        ],
        "weight": 0.25
    },
    "Bookkeeping Intelligence": {
        "metrics": [
            "VAT Calculation",
            "Total Reconciliation",
            "Tax Code Assignment",
            "Account Classification"
        ],
        "weight": 0.25
    },
    "Error Handling": {
        "metrics": [
            "Validation Rules",
            "Inconsistency Detection",
            "Missing Data Handling",
            "Format Validation"
        ],
        "weight": 0.25
    }
}
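
# Sanity check (assumed intent): the category weights should sum to 1.0 so the
# weighted "Average Score" stays on the same 0-1 scale as the raw metrics.
assert abs(sum(c["weight"] for c in CATEGORIES.values()) - 1.0) < 1e-9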


MODELS = {
    "Ark II": {
        "version": "ark-ii-v1",
        "type": "Text + Vision",
        "provider": "Jenesys AI",
        "inference_time": "17.94s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.733,
                "Date of Invoice": 0.887,
                "Line Items Total": 0.803,
                "Overall": 0.808
            },
            "Data Extraction": {
                "Supplier": 0.735,
                "Line Items Quantity": 0.882,
                "Line Items Description": 0.555,
                "VAT Number": 0.768,
                "Line Items Total": 0.803,
                "Overall": 0.749
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.800,
                "Line Items VAT": 0.590,
                "VAT Exclusive": 0.694,
                "VAT Number": 0.768,
                "Discount Verification": 0.800,
                "Overall": 0.730
            },
            "Error Handling": {
                "Mean Accuracy": 0.718,
                "Overall": 0.718
            }
        }
    },
    "Claude-3-5-Sonnet": {
        "version": "claude-3-5-sonnet-20241022",
        "type": "Text + Vision",
        "provider": "Anthropic",
        "inference_time": "26.51s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.773,
                "Date of Invoice": 0.806,
                "Line Items Total": 0.533,
                "Overall": 0.704
            },
            "Data Extraction": {
                "Supplier": 0.706,
                "Line Items Quantity": 0.597,
                "Line Items Description": 0.504,
                "VAT Number": 0.708,
                "Line Items Total": 0.533,
                "Overall": 0.609
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.600,
                "Line Items VAT": 0.524,
                "VAT Exclusive": 0.706,
                "VAT Number": 0.708,
                "Discount Verification": 0.600,
                "Overall": 0.628
            },
            "Error Handling": {
                "Mean Accuracy": 0.675,
                "Overall": 0.675
            }
        }
    },
    "GPT-4o": {
        "version": "gpt-4o",
        "type": "Text + Vision",
        "provider": "OpenAI",
        "inference_time": "19.88s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.600,
                "Date of Invoice": 0.917,
                "Line Items Total": 0.571,
                "Overall": 0.696
            },
            "Data Extraction": {
                "Supplier": 0.818,
                "Line Items Quantity": 0.722,
                "Line Items Description": 0.619,
                "VAT Number": 0.714,
                "Line Items Total": 0.571,
                "Overall": 0.689
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.000,
                "Line Items VAT": 0.313,
                "VAT Exclusive": 0.250,
                "VAT Number": 0.714,
                "Discount Verification": 0.000,
                "Overall": 0.255
            },
            "Error Handling": {
                "Mean Accuracy": 0.683,
                "Overall": 0.683
            }
        }
    },
    "Ark I": {
        "version": "ark-i-v1",
        "type": "Text + Vision",
        "provider": "Jenesys AI",
        "inference_time": "7.955s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.747,
                "Date of Invoice": 0.905,
                "Line Items Total": 0.703,
                "Overall": 0.785
            },
            "Data Extraction": {
                "Supplier": 0.792,
                "Line Items Quantity": 0.811,
                "Line Items Description": 0.521,
                "VAT Number": 0.719,
                "Line Items Total": 0.703,
                "Overall": 0.709
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.600,
                "Line Items VAT": 0.434,
                "VAT Exclusive": 0.491,
                "VAT Number": 0.719,
                "Discount Verification": 0.600,
                "Overall": 0.569
            },
            "Error Handling": {
                "Mean Accuracy": 0.641,
                "Overall": 0.641
            }
        }
    }
}


def calculate_category_score(scores):
    """Calculate the average score for a category's metrics, excluding 'Overall'."""
    metrics = {k: v for k, v in scores.items() if k != 'Overall'}
    return sum(metrics.values()) / len(metrics)
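
# Worked example: Ark II's Document Understanding scores {"Invoice ID": 0.733,
# "Date of Invoice": 0.887, "Line Items Total": 0.803} average to
# (0.733 + 0.887 + 0.803) / 3 ≈ 0.808, matching the stored "Overall" value.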


def calculate_overall_score(model_data):
    """Calculate the weighted average score across all categories."""
    category_scores = {}
    for category, metrics in model_data["scores"].items():
        category_metrics = {k: v for k, v in metrics.items() if k != 'Overall'}
        category_scores[category] = (
            sum(category_metrics.values()) / len(category_metrics)
            * CATEGORIES[category]["weight"]
        )
    return sum(category_scores.values())
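
# With the current equal weights this reduces to the plain mean of the four
# category means: overall = 0.25*DU + 0.25*DE + 0.25*BI + 0.25*EH.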


def create_leaderboard_df():
    """Create a DataFrame for the leaderboard with detailed metrics."""
    data = []
    for model_name, model_info in MODELS.items():
        category_scores = {
            category: calculate_category_score(metrics)
            for category, metrics in model_info["scores"].items()
        }

        # "Average Score" is the weighted mean across all four categories,
        # not just the Error Handling score.
        average_score = calculate_overall_score(model_info)

        row = {
            "Model": model_name,
            "Version": model_info["version"],
            "Type": model_info["type"],
            "Provider": model_info["provider"],
            "Average Score": average_score,
            **category_scores
        }
        data.append(row)

    df = pd.DataFrame(data)
    return df.sort_values("Average Score", ascending=False)
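
# Resulting columns: Model, Version, Type, Provider, Average Score, plus one
# column per category; rows are sorted by Average Score (descending).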


def create_category_comparison():
    """Create a bar chart comparing all models across categories."""
    df = create_leaderboard_df()
    df_melted = df.melt(
        id_vars=["Model"],
        value_vars=list(CATEGORIES.keys()),
        var_name="Category",
        value_name="Score"
    )

    fig = px.bar(
        df_melted,
        x="Category",
        y="Score",
        color="Model",
        barmode="group",
        range_y=[0, 1.0]
    )

    fig.update_layout(
        xaxis_title="Category",
        yaxis_title="Score",
        legend_title="Model",
        font=dict(size=14),
        title=dict(
            text="Model Performance by Category",
            x=0.5,
            y=0.95,
            xanchor='center',
            yanchor='top',
            font=dict(size=20)
        ),
        yaxis=dict(
            tickmode='array',
            ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
            tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
            gridcolor='rgba(0, 0, 0, 0.1)',
            zeroline=True,
            zerolinecolor='rgba(0, 0, 0, 0.2)',
            zerolinewidth=1
        ),
        xaxis=dict(
            tickangle=-45,
            gridcolor='rgba(0, 0, 0, 0.1)'
        ),
        bargap=0.2,
        bargroupgap=0.1,
        paper_bgcolor='rgba(255, 255, 255, 0.9)',
        plot_bgcolor='rgba(255, 255, 255, 0.9)',
        margin=dict(t=100, b=100, l=100, r=20),
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.02,
            bgcolor='rgba(255, 255, 255, 0.9)',
            bordercolor='rgba(0, 0, 0, 0.1)',
            borderwidth=1
        )
    )

    return fig
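
# Note: scores are plotted on a 0-1 axis; the tick labels render them as
# percentages for readability.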


def create_combined_radar_chart():
    """Create a radar chart showing all models together."""
    try:
        categories = list(CATEGORIES.keys())

        colors = {
            "Ark II": "rgb(99, 110, 250)",
            "Claude-3-5-Sonnet": "rgb(239, 85, 59)",
            "GPT-4o": "rgb(0, 204, 150)",
            "Ark I": "rgb(171, 99, 250)"
        }

        fig = go.Figure()

        for model_name, color in colors.items():
            model_data = MODELS[model_name]
            values = []

            for category in categories:
                metrics = {k: v for k, v in model_data["scores"][category].items() if k != 'Overall'}
                if category == "Error Handling":
                    values.append(metrics.get("Mean Accuracy", 0.0))
                else:
                    values.append(sum(metrics.values()) / len(metrics) if metrics else 0.0)

            # Repeat the first point so the polygon closes.
            fig.add_trace(go.Scatterpolar(
                r=values + [values[0]],
                theta=categories + [categories[0]],
                fill='none',
                line=dict(color=color, width=2),
                name=model_name
            ))

        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1.0],
                    tickmode='array',
                    ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
                    tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
                    gridcolor='rgba(0, 0, 0, 0.1)',
                    linecolor='rgba(0, 0, 0, 0.1)'
                ),
                angularaxis=dict(
                    gridcolor='rgba(0, 0, 0, 0.1)',
                    linecolor='rgba(0, 0, 0, 0.1)'
                ),
                bgcolor='rgba(255, 255, 255, 0.9)'
            ),
            showlegend=True,
            paper_bgcolor='rgba(255, 255, 255, 0.9)',
            plot_bgcolor='rgba(255, 255, 255, 0.9)',
            title=dict(
                text="Model Performance Comparison",
                x=0.5,
                y=0.95,
                xanchor='center',
                yanchor='top',
                font=dict(size=20)
            ),
            legend=dict(
                yanchor="top",
                y=1,
                xanchor="left",
                x=1.02
            ),
            margin=dict(t=100, b=100, l=100, r=100)
        )

        return fig
    except Exception as e:
        print(f"Error creating radar chart: {str(e)}")
        return go.Figure()
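
# Note: Error Handling exposes a single "Mean Accuracy" metric, so its radar
# axis uses that value directly instead of averaging several metrics.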


def create_comparison_metrics_df(model_name):
    """Create a DataFrame showing detailed metrics with comparisons."""
    base_model = "Ark II"
    data = []

    base_data = MODELS[base_model]["scores"]
    compare_data = MODELS[model_name]["scores"]

    for category in CATEGORIES.keys():
        base_metrics = {k: v for k, v in base_data[category].items() if k != 'Overall'}
        compare_metrics = {k: v for k, v in compare_data[category].items() if k != 'Overall'}

        for metric in base_metrics.keys():
            if metric in compare_metrics:
                base_value = base_metrics[metric]
                compare_value = compare_metrics[metric]
                diff = compare_value - base_value

                data.append({
                    "Category": category,
                    "Metric": metric,
                    f"{model_name} Score": compare_value,
                    f"{base_model} Score": base_value,
                    "Difference": diff,
                    "Better/Worse": "↑" if diff > 0 else "↓" if diff < 0 else "="
                })

    df = pd.DataFrame(data)
    return df
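
# Example: create_comparison_metrics_df("GPT-4o") compares GPT-4o against the
# Ark II baseline, e.g. Invoice ID -> 0.600 vs 0.733, Difference -0.133, "↓".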


def update_model_details(model_name):
    """Update the detailed metrics view for a selected model."""
    try:
        df = create_comparison_metrics_df(model_name)
        return [df, create_combined_radar_chart()]
    except Exception as e:
        print(f"Error in update_model_details: {str(e)}")
        return [pd.DataFrame(), go.Figure()]


def get_logo_html():
    """Return an inline <img> tag for the logo, or '' if the file is missing."""
    logo_path = os.path.join(os.path.dirname(__file__), "jenesys.jpg")
    if not os.path.exists(logo_path):
        # Degrade gracefully instead of crashing when the asset isn't bundled.
        return ""
    with open(logo_path, "rb") as f:
        encoded_logo = base64.b64encode(f.read()).decode()
    return f'<img src="data:image/jpeg;base64,{encoded_logo}" style="height: 50px; margin-right: 10px;">'
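
# Inlining the logo as a base64 data URI keeps the app self-contained, so no
# separate static-file route is needed wherever the app is deployed.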


with gr.Blocks(title="AI Bookkeeper Leaderboard") as demo:
    gr.Markdown(f"""
    <div style="display: flex; align-items: center; margin-bottom: 1rem;">
        {get_logo_html()}
        <h1 style="margin: 0;">AI Bookkeeper Leaderboard</h1>
    </div>
    """)

    gr.Markdown(f"Last updated: {datetime.now().strftime('%Y-%m-%d')}")

    gr.Markdown("""
    ## About the Benchmark

    This benchmark evaluates large vision-language models on their ability to process and understand bookkeeping documents across four main categories:

    1. **Document Understanding (25%)**: Ability to parse and understand document structure
    2. **Data Extraction (25%)**: Accuracy in extracting specific data points
    3. **Bookkeeping Intelligence (25%)**: Understanding of bookkeeping concepts, calculations, and general ledger accounting
    4. **Error Handling (25%)**: Ability to detect and handle inconsistencies

    Each metric is scored from 0 to 1, where:
    - 0.90-1.00 = Excellent
    - 0.80-0.89 = Good
    - 0.70-0.79 = Acceptable
    - < 0.70 = Needs improvement
    """)

    with gr.Row():
        leaderboard = gr.DataFrame(
            create_leaderboard_df(),
            label="Overall Leaderboard",
            height=200
        )

    with gr.Row():
        with gr.Column(scale=1, min_width=1200):
            category_plot = gr.Plot(
                value=create_category_comparison()
            )

    with gr.Row():
        with gr.Column(scale=1):
            model_selector = gr.Dropdown(
                choices=[m for m in MODELS if m != "Ark II"],
                label="Select Model to Compare with Ark II",
                value="Claude-3-5-Sonnet",
                interactive=True
            )

    with gr.Row():
        with gr.Column(scale=2):
            metrics_table = gr.DataFrame(
                create_comparison_metrics_df("Claude-3-5-Sonnet"),
                label="Comparison Metrics (vs Ark II)",
                height=400
            )

    with gr.Row():
        with gr.Column(scale=1, min_width=1200):
            radar_chart = gr.Plot(value=create_combined_radar_chart())

    model_selector.change(
        fn=update_model_details,
        inputs=[model_selector],
        outputs=[metrics_table, radar_chart]
    )
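
    # update_model_details returns [DataFrame, Figure]; Gradio assigns the
    # returned values positionally to the `outputs` components listed above.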


if __name__ == "__main__":
    demo.launch(share=True)