Spaces:

jenesys-ai
/

ai_bookkeeper_leaderboard

Running

File size: 16,198 Bytes

b262a9b

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import os
import base64

# Benchmark categories and their component metrics.
#
# Each entry holds:
#   "metrics": descriptive names of the category's component metrics. No code
#              visible in this file reads this list.
#              NOTE(review): these names do not match the per-metric keys used
#              under MODELS[...]["scores"] (e.g. "Invoice ID Detection" here
#              vs "Invoice ID" there) - confirm which naming is authoritative.
#   "weight":  the category's multiplier in calculate_overall_score(); the
#              four weights below sum to 1.0.
#
# The key order of this dict is also the category order used by the bar
# chart, the radar chart and the comparison table.
CATEGORIES = {
    "Document Understanding": {
        "metrics": [
            "Invoice ID Detection",
            "Date Field Recognition",
            "Address Block Parsing",
            "Table Structure Recognition"
        ],
        "weight": 0.25
    },
    "Data Extraction": {
        "metrics": [
            "Line Item Extraction",
            "Numerical Value Accuracy",
            "Text Field Accuracy",
            "Field Completeness"
        ],
        "weight": 0.25
    },
    "Bookkeeping Intelligence": {
        "metrics": [
            "VAT Calculation",
            "Total Reconciliation",
            "Tax Code Assignment",
            "Account Classification"
        ],
        "weight": 0.25
    },
    "Error Handling": {
        "metrics": [
            "Validation Rules",
            "Inconsistency Detection",
            "Missing Data Handling",
            "Format Validation"
        ],
        "weight": 0.25
    }
}

# Updated benchmark data with real metrics, keyed by the model's display name.
#
# Each entry holds:
#   "version", "type", "provider": copied into the leaderboard table.
#   "inference_time": a string with an "s" suffix; no code visible in this
#                     file reads it.
#   "scores": category name -> {metric name: score}. Category names match the
#             keys of CATEGORIES. Every category also stores an "Overall"
#             value, but all consumers in this file filter "Overall" out and
#             recompute the category score from the individual metrics.
#
# The "Error Handling" category has a single "Mean Accuracy" metric, which the
# leaderboard uses as the model's "Average Score".
MODELS = {
    "Ark II": {
        "version": "ark-ii-v1",
        "type": "Text + Vision",
        "provider": "Jenesys AI",
        "inference_time": "17.94s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.733,
                "Date of Invoice": 0.887,
                "Line Items Total": 0.803,
                "Overall": 0.808
            },
            "Data Extraction": {
                "Supplier": 0.735,
                "Line Items Quantity": 0.882,
                "Line Items Description": 0.555,
                "VAT Number": 0.768,
                "Line Items Total": 0.803,
                "Overall": 0.749
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.800,
                "Line Items VAT": 0.590,
                "VAT Exclusive": 0.694,
                "VAT Number": 0.768,
                "Discount Verification": 0.800,
                "Overall": 0.730
            },
            "Error Handling": {
                "Mean Accuracy": 0.718,
                "Overall": 0.718
            }
        }
    },
    "Claude-3-5-Sonnet": {
        "version": "claude-3-5-sonnet-20241022",
        "type": "Text + Vision",
        "provider": "Anthropic",
        "inference_time": "26.51s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.773,
                "Date of Invoice": 0.806,
                "Line Items Total": 0.533,
                "Overall": 0.704
            },
            "Data Extraction": {
                "Supplier": 0.706,
                "Line Items Quantity": 0.597,
                "Line Items Description": 0.504,
                "VAT Number": 0.708,
                "Line Items Total": 0.533,
                "Overall": 0.609
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.600,
                "Line Items VAT": 0.524,
                "VAT Exclusive": 0.706,
                "VAT Number": 0.708,
                "Discount Verification": 0.600,
                "Overall": 0.628
            },
            "Error Handling": {
                "Mean Accuracy": 0.675,
                "Overall": 0.675
            }
        }
    },
    "GPT-4o": {
        "version": "gpt-4o",
        "type": "Text + Vision",
        "provider": "OpenAI",
        "inference_time": "19.88s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.600,
                "Date of Invoice": 0.917,
                "Line Items Total": 0.571,
                "Overall": 0.696
            },
            "Data Extraction": {
                "Supplier": 0.818,
                "Line Items Quantity": 0.722,
                "Line Items Description": 0.619,
                "VAT Number": 0.714,
                "Line Items Total": 0.571,
                "Overall": 0.689
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.000,
                "Line Items VAT": 0.313,
                "VAT Exclusive": 0.250,
                "VAT Number": 0.714,
                "Discount Verification": 0.000,
                "Overall": 0.255
            },
            "Error Handling": {
                "Mean Accuracy": 0.683,
                "Overall": 0.683
            }
        }
    },
    "Ark I": {
        "version": "ark-i-v1",
        "type": "Text + Vision",
        "provider": "Jenesys AI",
        "inference_time": "7.955s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.747,
                "Date of Invoice": 0.905,
                "Line Items Total": 0.703,
                "Overall": 0.785
            },
            "Data Extraction": {
                "Supplier": 0.792,
                "Line Items Quantity": 0.811,
                "Line Items Description": 0.521,
                "VAT Number": 0.719,
                "Line Items Total": 0.703,
                "Overall": 0.709
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.600,
                "Line Items VAT": 0.434,
                "VAT Exclusive": 0.491,
                "VAT Number": 0.719,
                "Discount Verification": 0.600,
                "Overall": 0.569
            },
            "Error Handling": {
                "Mean Accuracy": 0.641,
                "Overall": 0.641
            }
        }
    }
}

def calculate_category_score(scores):
    """Return the unweighted mean of a category's metric scores.

    The stored ``"Overall"`` entry is ignored, so the result is always
    recomputed from the individual metrics.

    Args:
        scores: Mapping of metric name to numeric score; may include an
            ``"Overall"`` key.

    Returns:
        The arithmetic mean of the non-``"Overall"`` values, or ``0.0`` when
        there are none (instead of raising ``ZeroDivisionError``).
    """
    values = [value for name, value in scores.items() if name != 'Overall']
    if not values:
        # Same fallback the radar chart uses for a category without metrics.
        return 0.0
    return sum(values) / len(values)

def calculate_overall_score(model_data, categories=None):
    """Return the weighted sum of a model's per-category mean scores.

    For each category under ``model_data["scores"]`` the mean of its metrics
    (ignoring the stored ``"Overall"`` entry) is multiplied by that
    category's ``"weight"``, and the products are summed.

    Args:
        model_data: A model entry containing a ``"scores"`` mapping of
            category name -> {metric name: score}.
        categories: Optional mapping of category name -> {"weight": number}.
            Defaults to the module-level ``CATEGORIES``.

    Returns:
        The weighted score. A category with no metrics besides ``"Overall"``
        contributes nothing (instead of raising ``ZeroDivisionError``).

    Raises:
        KeyError: If a scored category has no entry in ``categories``.
    """
    if categories is None:
        categories = CATEGORIES

    total = 0.0
    for category, metrics in model_data["scores"].items():
        values = [value for name, value in metrics.items() if name != 'Overall']
        if not values:
            # Nothing to average for this category; skip rather than divide by zero.
            continue
        total += sum(values) / len(values) * categories[category]["weight"]
    return total

def create_leaderboard_df():
    """Build the leaderboard table: one row per model, highest score first.

    Each row carries the model's name, version, type and provider, an
    "Average Score", and one column per scored category holding the mean of
    that category's metrics. Rows are sorted by "Average Score" descending.
    """
    rows = []
    for name, info in MODELS.items():
        per_category = {}
        for category, metrics in info["scores"].items():
            per_category[category] = calculate_category_score(metrics)

        # "Average Score" is intentionally the Error Handling category score.
        average = calculate_category_score(info["scores"]["Error Handling"])

        record = {
            "Model": name,
            "Version": info["version"],
            "Type": info["type"],
            "Provider": info["provider"],
            "Average Score": average,
        }
        # Category columns come after the fixed identifying columns.
        record.update(per_category)
        rows.append(record)

    leaderboard = pd.DataFrame(rows)
    return leaderboard.sort_values("Average Score", ascending=False)


def create_category_comparison():
    """Build a grouped bar chart of every model's score in every category.

    The leaderboard table is reshaped to long form (one row per model and
    category) and plotted with categories on the x axis, scores on a fixed
    0-1 y axis labelled as percentages, and one bar colour per model.

    Returns:
        The configured plotly figure.
    """
    chart_title = "Model Performance by Category"
    translucent_white = 'rgba(255, 255, 255, 0.9)'
    faint_line = 'rgba(0, 0, 0, 0.1)'

    # One row per (model, category) pair so plotly can group bars by model.
    long_scores = create_leaderboard_df().melt(
        id_vars=["Model"],
        value_vars=list(CATEGORIES.keys()),
        var_name="Category",
        value_name="Score",
    )

    fig = px.bar(
        long_scores,
        x="Category",
        y="Score",
        color="Model",
        barmode="group",
        title=chart_title,
        range_y=[0, 1.0],
    )

    # Centred, larger title; replaces the plain title set by px.bar above.
    title_style = {
        "text": chart_title,
        "x": 0.5,
        "y": 0.95,
        "xanchor": 'center',
        "yanchor": 'top',
        "font": {"size": 20},
    }
    # Scores are fractions in [0, 1]; show the ticks as percentages.
    score_axis = {
        "tickmode": 'array',
        "ticktext": ['0%', '20%', '40%', '60%', '80%', '100%'],
        "tickvals": [0, 0.2, 0.4, 0.6, 0.8, 1.0],
        "gridcolor": faint_line,
        "zeroline": True,
        "zerolinecolor": 'rgba(0, 0, 0, 0.2)',
        "zerolinewidth": 1,
    }
    category_axis = {
        "tickangle": -45,
        "gridcolor": faint_line,
    }
    # Legend sits just outside the plot area, top right.
    legend_style = {
        "yanchor": "top",
        "y": 1,
        "xanchor": "left",
        "x": 1.02,
        "bgcolor": translucent_white,
        "bordercolor": faint_line,
        "borderwidth": 1,
    }

    fig.update_layout(
        xaxis_title="Category",
        yaxis_title="Score",
        legend_title="Model",
        font={"size": 14},
        title=title_style,
        yaxis=score_axis,
        xaxis=category_axis,
        bargap=0.2,
        bargroupgap=0.1,
        paper_bgcolor=translucent_white,
        plot_bgcolor=translucent_white,
        margin={"t": 100, "b": 100, "l": 100, "r": 20},
        showlegend=True,
        legend=legend_style,
    )

    return fig



def create_combined_radar_chart():
    """Create a radar chart showing every model in MODELS together.

    Each model becomes one closed, unfilled polygon with one vertex per
    category in CATEGORIES. A vertex is the mean of the category's metrics
    (ignoring the stored "Overall"), except "Error Handling", which uses its
    "Mean Accuracy" metric directly.

    Every entry in MODELS is plotted. The colour map below is only a
    preference: a model without an entry is still drawn and is left to
    plotly's default colour assignment, so adding or renaming a model cannot
    drop it from the chart or blank the whole figure.

    Returns:
        The configured plotly figure, or an empty figure if building it
        fails (the error is printed).
    """
    try:
        categories = list(CATEGORIES.keys())

        # Preferred line colours for the known models
        colors = {
            "Ark II": "rgb(99, 110, 250)",      # Blue
            "Claude-3-5-Sonnet": "rgb(239, 85, 59)", # Red
            "GPT-4o": "rgb(0, 204, 150)",           # Green
            "Ark I": "rgb(171, 99, 250)"        # Purple
        }

        fig = go.Figure()

        # Add a trace for each model, in MODELS order
        for model_name, model_data in MODELS.items():
            values = []

            for category in categories:
                metrics = {k: v for k, v in model_data["scores"][category].items() if k != 'Overall'}
                if category == "Error Handling":
                    values.append(metrics.get("Mean Accuracy", 0.0))
                else:
                    values.append(sum(metrics.values()) / len(metrics) if metrics else 0.0)

            # Only pin the colour when one is configured for this model.
            line_style = dict(width=2)
            if model_name in colors:
                line_style["color"] = colors[model_name]

            fig.add_trace(go.Scatterpolar(
                # Repeat the first point so the polygon closes.
                r=values + [values[0]],
                theta=categories + [categories[0]],
                fill='none',
                line=line_style,
                name=model_name
            ))

        # Update layout
        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1.0],
                    tickmode='array',
                    ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
                    tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
                    gridcolor='rgba(0, 0, 0, 0.1)',
                    linecolor='rgba(0, 0, 0, 0.1)'
                ),
                angularaxis=dict(
                    gridcolor='rgba(0, 0, 0, 0.1)',
                    linecolor='rgba(0, 0, 0, 0.1)'
                ),
                bgcolor='rgba(255, 255, 255, 0.9)'
            ),
            showlegend=True,
            paper_bgcolor='rgba(255, 255, 255, 0.9)',
            plot_bgcolor='rgba(255, 255, 255, 0.9)',
            title=dict(
                text="Model Performance Comparison",
                x=0.5,
                y=0.95,
                xanchor='center',
                yanchor='top',
                font=dict(size=20)
            ),
            legend=dict(
                yanchor="top",
                y=1,
                xanchor="left",
                x=1.02
            ),
            margin=dict(t=100, b=100, l=100, r=100)
        )

        return fig
    except Exception as e:
        # Best effort: the page should still render if the chart cannot be built.
        print(f"Error creating radar chart: {str(e)}")
        return go.Figure()

def create_comparison_metrics_df(model_name):
    """Tabulate metric-by-metric scores of ``model_name`` against "Ark II".

    Categories are walked in CATEGORIES order. Within a category, each metric
    of the baseline model (other than "Overall") that the selected model also
    reports produces one row with both scores, the signed difference
    (selected minus baseline) and an arrow marking its direction.

    Args:
        model_name: Key of the model in MODELS to compare with the baseline.

    Returns:
        A DataFrame with one row per shared metric.
    """
    base_model = "Ark II"
    reference_scores = MODELS[base_model]["scores"]
    candidate_scores = MODELS[model_name]["scores"]

    rows = []
    for category in CATEGORIES:
        reference_metrics = reference_scores[category]
        candidate_metrics = candidate_scores[category]

        for metric, reference_value in reference_metrics.items():
            # Skip the stored aggregate and metrics the other model lacks.
            if metric == 'Overall' or metric not in candidate_metrics:
                continue

            candidate_value = candidate_metrics[metric]
            delta = candidate_value - reference_value

            if delta > 0:
                direction = "↑"
            elif delta < 0:
                direction = "↓"
            else:
                direction = "="

            rows.append({
                "Category": category,
                "Metric": metric,
                f"{model_name} Score": candidate_value,
                f"{base_model} Score": reference_value,
                "Difference": delta,
                "Better/Worse": direction,
            })

    return pd.DataFrame(rows)

def update_model_details(model_name):
    """Return the refreshed comparison table and radar chart for a model.

    Used as the dropdown's change callback. If either piece fails to build,
    the error is printed and an empty table and empty figure are returned.

    Args:
        model_name: Key of the model in MODELS selected for comparison.

    Returns:
        A two-item list: the comparison DataFrame and the radar figure.
    """
    try:
        comparison_table = create_comparison_metrics_df(model_name)
        radar = create_combined_radar_chart()
    except Exception as e:
        print(f"Error in update_model_details: {str(e)}")
        return [pd.DataFrame(), go.Figure()]
    return [comparison_table, radar]

# Load logo as base64
def get_logo_html():
    """Return an ``<img>`` tag embedding ``jenesys.jpg`` as a base64 data URI.

    The image is read from the directory containing this source file.

    Returns:
        The HTML ``<img>`` tag, or an empty string when the logo file cannot
        be read. This function is called while the page layout is built at
        import time, so a missing asset must not stop the app from starting.
    """
    logo_path = os.path.join(os.path.dirname(__file__), "jenesys.jpg")
    try:
        with open(logo_path, "rb") as f:
            encoded_logo = base64.b64encode(f.read()).decode()
    except OSError as e:
        # Missing or unreadable logo: render the header without an image.
        print(f"Error loading logo: {str(e)}")
        return ""
    return f'<img src="data:image/jpeg;base64,{encoded_logo}" style="height: 50px; margin-right: 10px;">'

# Create the Gradio interface.
# This block runs at module import, so every helper called below (logo
# loading, table and chart construction) executes once when the app starts.
with gr.Blocks(title="AI Bookkeeper Leaderboard") as demo:
    # Page header: logo (embedded as a base64 <img>) next to the title.
    gr.Markdown(f"""
    <div style="display: flex; align-items: center; margin-bottom: 1rem;">
        {get_logo_html()}
        <h1 style="margin: 0;">AI Bookkeeper Leaderboard</h1>
    </div>
    """)

    # NOTE(review): this is the date the app process started, not the date
    # the benchmark data in MODELS was last changed.
    gr.Markdown(f"Last updated: {datetime.now().strftime('%Y-%m-%d')}")

    # Static description of the benchmark; the 25% figures mirror the
    # weights in CATEGORIES.
    gr.Markdown("""
    ## About the Benchmark 📊
    
    This benchmark evaluates Large Vision Language Models on their ability to process and understand bookkeeping documents across four main categories:
    
    1. **Document Understanding (25%)**: Ability to parse and understand document structure
    2. **Data Extraction (25%)**: Accuracy in extracting specific data points
    3. **Bookkeeping Intelligence (25%)**: Understanding of bookkeeping concepts, calculations and general ledger accounting
    4. **Error Handling (25%)**: Ability to detect and handle inconsistencies
    
    Each metric is scored from 0 to 1, where:
    - 0.90-1.00 = Excellent
    - 0.80-0.89 = Good
    - 0.70-0.79 = Acceptable
    - < 0.70 = Needs improvement
    
    """)
    
    # Leaderboard table, computed once at startup.
    with gr.Row():
        leaderboard = gr.DataFrame(
            create_leaderboard_df(),
            label="Overall Leaderboard",
            height=200
        )
    
    # Grouped bar chart of per-category scores for all models.
    with gr.Row():
        with gr.Column(scale=1, min_width=1200):
            category_plot = gr.Plot(
                value=create_category_comparison()
            )
    
    # Model picker. "Ark II" is excluded because it is the fixed baseline
    # that create_comparison_metrics_df() compares every selection against.
    with gr.Row():
        with gr.Column(scale=1):
            model_selector = gr.Dropdown(
                choices=[m for m in list(MODELS.keys()) if m != "Ark II"],
                label="Select Model to Compare with Ark II",
                value="Claude-3-5-Sonnet",
                interactive=True
            )
    
    # Comparison table, pre-filled for the dropdown's default selection.
    with gr.Row():
        with gr.Column(scale=2):
            metrics_table = gr.DataFrame(
                create_comparison_metrics_df("Claude-3-5-Sonnet"),
                label="Comparison Metrics (vs Ark II)",
                height=400
            )
    
    # Radar chart of all models; it does not depend on the selected model.
    with gr.Row():
        with gr.Column(scale=1, min_width=1200):
            radar_chart = gr.Plot(value=create_combined_radar_chart())
    
    # Update callback: update_model_details returns [table, figure], which
    # map in order onto the two output components.
    model_selector.change(
        fn=update_model_details,
        inputs=[model_selector],
        outputs=[metrics_table, radar_chart]
    )
    
    

# Start the app only when this file is executed as a script, not on import.
# NOTE(review): share=True is passed straight to Gradio's launch(); presumably
# it requests a public share link - confirm it is wanted in the deployed environment.
if __name__ == "__main__":
    demo.launch(share=True)