import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import os
import base64

# Benchmark categories. Each category carries an equal 25% weight in the
# overall score; the "metrics" lists are descriptive labels for what each
# category covers (the measured values live in the per-model score dicts below).
CATEGORIES = {
    "Document Understanding": {
        "metrics": [
            "Invoice ID Detection",
            "Date Field Recognition",
            "Address Block Parsing",
            "Table Structure Recognition"
        ],
        "weight": 0.25
    },
    "Data Extraction": {
        "metrics": [
            "Line Item Extraction",
            "Numerical Value Accuracy",
            "Text Field Accuracy",
            "Field Completeness"
        ],
        "weight": 0.25
    },
    "Bookkeeping Intelligence": {
        "metrics": [
            "VAT Calculation",
            "Total Reconciliation",
            "Tax Code Assignment",
            "Account Classification"
        ],
        "weight": 0.25
    },
    "Error Handling": {
        "metrics": [
            "Validation Rules",
            "Inconsistency Detection",
            "Missing Data Handling",
            "Format Validation"
        ],
        "weight": 0.25
    }
}
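
# Sanity check (assumed intent): the category weights should sum to 1.0 so the
# weighted "Average Score" stays on the same 0-1 scale as the raw metrics.
assert abs(sum(c["weight"] for c in CATEGORIES.values()) - 1.0) < 1e-9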


MODELS = {
    "Ark II": {
        "version": "ark-ii-v1",
        "type": "Text + Vision",
        "provider": "Jenesys AI",
        "inference_time": "17.94s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.733,
                "Date of Invoice": 0.887,
                "Line Items Total": 0.803,
                "Overall": 0.808
            },
            "Data Extraction": {
                "Supplier": 0.735,
                "Line Items Quantity": 0.882,
                "Line Items Description": 0.555,
                "VAT Number": 0.768,
                "Line Items Total": 0.803,
                "Overall": 0.749
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.800,
                "Line Items VAT": 0.590,
                "VAT Exclusive": 0.694,
                "VAT Number": 0.768,
                "Discount Verification": 0.800,
                "Overall": 0.730
            },
            "Error Handling": {
                "Mean Accuracy": 0.718,
                "Overall": 0.718
            }
        }
    },
    "Claude-3-5-Sonnet": {
        "version": "claude-3-5-sonnet-20241022",
        "type": "Text + Vision",
        "provider": "Anthropic",
        "inference_time": "26.51s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.773,
                "Date of Invoice": 0.806,
                "Line Items Total": 0.533,
                "Overall": 0.704
            },
            "Data Extraction": {
                "Supplier": 0.706,
                "Line Items Quantity": 0.597,
                "Line Items Description": 0.504,
                "VAT Number": 0.708,
                "Line Items Total": 0.533,
                "Overall": 0.609
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.600,
                "Line Items VAT": 0.524,
                "VAT Exclusive": 0.706,
                "VAT Number": 0.708,
                "Discount Verification": 0.600,
                "Overall": 0.628
            },
            "Error Handling": {
                "Mean Accuracy": 0.675,
                "Overall": 0.675
            }
        }
    },
    "GPT-4o": {
        "version": "gpt-4o",
        "type": "Text + Vision",
        "provider": "OpenAI",
        "inference_time": "19.88s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.600,
                "Date of Invoice": 0.917,
                "Line Items Total": 0.571,
                "Overall": 0.696
            },
            "Data Extraction": {
                "Supplier": 0.818,
                "Line Items Quantity": 0.722,
                "Line Items Description": 0.619,
                "VAT Number": 0.714,
                "Line Items Total": 0.571,
                "Overall": 0.689
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.000,
                "Line Items VAT": 0.313,
                "VAT Exclusive": 0.250,
                "VAT Number": 0.714,
                "Discount Verification": 0.000,
                "Overall": 0.255
            },
            "Error Handling": {
                "Mean Accuracy": 0.683,
                "Overall": 0.683
            }
        }
    },
    "Ark I": {
        "version": "ark-i-v1",
        "type": "Text + Vision",
        "provider": "Jenesys AI",
        "inference_time": "7.955s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.747,
                "Date of Invoice": 0.905,
                "Line Items Total": 0.703,
                "Overall": 0.785
            },
            "Data Extraction": {
                "Supplier": 0.792,
                "Line Items Quantity": 0.811,
                "Line Items Description": 0.521,
                "VAT Number": 0.719,
                "Line Items Total": 0.703,
                "Overall": 0.709
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.600,
                "Line Items VAT": 0.434,
                "VAT Exclusive": 0.491,
                "VAT Number": 0.719,
                "Discount Verification": 0.600,
                "Overall": 0.569
            },
            "Error Handling": {
                "Mean Accuracy": 0.641,
                "Overall": 0.641
            }
        }
    }
}


def calculate_category_score(scores):
    """Calculate the average score for a category's metrics, excluding 'Overall'."""
    metrics = {k: v for k, v in scores.items() if k != 'Overall'}
    return sum(metrics.values()) / len(metrics)
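
# Worked example: Ark II's Document Understanding scores {"Invoice ID": 0.733,
# "Date of Invoice": 0.887, "Line Items Total": 0.803} average to
# (0.733 + 0.887 + 0.803) / 3 ≈ 0.808, matching the stored "Overall" value.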


def calculate_overall_score(model_data):
    """Calculate the weighted average score across all categories."""
    category_scores = {}
    for category, metrics in model_data["scores"].items():
        category_metrics = {k: v for k, v in metrics.items() if k != 'Overall'}
        category_scores[category] = (
            sum(category_metrics.values()) / len(category_metrics)
            * CATEGORIES[category]["weight"]
        )
    return sum(category_scores.values())
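
# With the current equal weights this reduces to the plain mean of the four
# category means: overall = 0.25*DU + 0.25*DE + 0.25*BI + 0.25*EH.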


def create_leaderboard_df():
    """Create a DataFrame for the leaderboard with detailed metrics."""
    data = []
    for model_name, model_info in MODELS.items():
        category_scores = {
            category: calculate_category_score(metrics)
            for category, metrics in model_info["scores"].items()
        }

        # "Average Score" is the weighted mean across all four categories,
        # not just the Error Handling score.
        average_score = calculate_overall_score(model_info)

        row = {
            "Model": model_name,
            "Version": model_info["version"],
            "Type": model_info["type"],
            "Provider": model_info["provider"],
            "Average Score": average_score,
            **category_scores
        }
        data.append(row)

    df = pd.DataFrame(data)
    return df.sort_values("Average Score", ascending=False)
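
# Resulting columns: Model, Version, Type, Provider, Average Score, plus one
# column per category; rows are sorted by Average Score (descending).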


def create_category_comparison():
    """Create a bar chart comparing all models across categories."""
    df = create_leaderboard_df()
    df_melted = df.melt(
        id_vars=["Model"],
        value_vars=list(CATEGORIES.keys()),
        var_name="Category",
        value_name="Score"
    )

    fig = px.bar(
        df_melted,
        x="Category",
        y="Score",
        color="Model",
        barmode="group",
        range_y=[0, 1.0]
    )

    fig.update_layout(
        xaxis_title="Category",
        yaxis_title="Score",
        legend_title="Model",
        font=dict(size=14),
        title=dict(
            text="Model Performance by Category",
            x=0.5,
            y=0.95,
            xanchor='center',
            yanchor='top',
            font=dict(size=20)
        ),
        yaxis=dict(
            tickmode='array',
            ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
            tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
            gridcolor='rgba(0, 0, 0, 0.1)',
            zeroline=True,
            zerolinecolor='rgba(0, 0, 0, 0.2)',
            zerolinewidth=1
        ),
        xaxis=dict(
            tickangle=-45,
            gridcolor='rgba(0, 0, 0, 0.1)'
        ),
        bargap=0.2,
        bargroupgap=0.1,
        paper_bgcolor='rgba(255, 255, 255, 0.9)',
        plot_bgcolor='rgba(255, 255, 255, 0.9)',
        margin=dict(t=100, b=100, l=100, r=20),
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.02,
            bgcolor='rgba(255, 255, 255, 0.9)',
            bordercolor='rgba(0, 0, 0, 0.1)',
            borderwidth=1
        )
    )

    return fig
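
# Note: scores are plotted on a 0-1 axis; the tick labels render them as
# percentages for readability.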


def create_combined_radar_chart():
    """Create a radar chart showing all models together."""
    try:
        categories = list(CATEGORIES.keys())

        colors = {
            "Ark II": "rgb(99, 110, 250)",
            "Claude-3-5-Sonnet": "rgb(239, 85, 59)",
            "GPT-4o": "rgb(0, 204, 150)",
            "Ark I": "rgb(171, 99, 250)"
        }

        fig = go.Figure()

        for model_name, color in colors.items():
            model_data = MODELS[model_name]
            values = []

            for category in categories:
                metrics = {k: v for k, v in model_data["scores"][category].items() if k != 'Overall'}
                if category == "Error Handling":
                    values.append(metrics.get("Mean Accuracy", 0.0))
                else:
                    values.append(sum(metrics.values()) / len(metrics) if metrics else 0.0)

            # Repeat the first point so the polygon closes.
            fig.add_trace(go.Scatterpolar(
                r=values + [values[0]],
                theta=categories + [categories[0]],
                fill='none',
                line=dict(color=color, width=2),
                name=model_name
            ))

        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1.0],
                    tickmode='array',
                    ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
                    tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
                    gridcolor='rgba(0, 0, 0, 0.1)',
                    linecolor='rgba(0, 0, 0, 0.1)'
                ),
                angularaxis=dict(
                    gridcolor='rgba(0, 0, 0, 0.1)',
                    linecolor='rgba(0, 0, 0, 0.1)'
                ),
                bgcolor='rgba(255, 255, 255, 0.9)'
            ),
            showlegend=True,
            paper_bgcolor='rgba(255, 255, 255, 0.9)',
            plot_bgcolor='rgba(255, 255, 255, 0.9)',
            title=dict(
                text="Model Performance Comparison",
                x=0.5,
                y=0.95,
                xanchor='center',
                yanchor='top',
                font=dict(size=20)
            ),
            legend=dict(
                yanchor="top",
                y=1,
                xanchor="left",
                x=1.02
            ),
            margin=dict(t=100, b=100, l=100, r=100)
        )

        return fig
    except Exception as e:
        print(f"Error creating radar chart: {str(e)}")
        return go.Figure()
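
# Note: Error Handling exposes a single "Mean Accuracy" metric, so its radar
# axis uses that value directly instead of averaging several metrics.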


def create_comparison_metrics_df(model_name):
    """Create a DataFrame showing detailed metrics with comparisons."""
    base_model = "Ark II"
    data = []

    base_data = MODELS[base_model]["scores"]
    compare_data = MODELS[model_name]["scores"]

    for category in CATEGORIES.keys():
        base_metrics = {k: v for k, v in base_data[category].items() if k != 'Overall'}
        compare_metrics = {k: v for k, v in compare_data[category].items() if k != 'Overall'}

        for metric in base_metrics.keys():
            if metric in compare_metrics:
                base_value = base_metrics[metric]
                compare_value = compare_metrics[metric]
                diff = compare_value - base_value

                data.append({
                    "Category": category,
                    "Metric": metric,
                    f"{model_name} Score": compare_value,
                    f"{base_model} Score": base_value,
                    "Difference": diff,
                    "Better/Worse": "↑" if diff > 0 else "↓" if diff < 0 else "="
                })

    df = pd.DataFrame(data)
    return df
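
# Example: create_comparison_metrics_df("GPT-4o") compares GPT-4o against the
# Ark II baseline, e.g. Invoice ID -> 0.600 vs 0.733, Difference -0.133, "↓".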


def update_model_details(model_name):
    """Update the detailed metrics view for a selected model."""
    try:
        df = create_comparison_metrics_df(model_name)
        return [df, create_combined_radar_chart()]
    except Exception as e:
        print(f"Error in update_model_details: {str(e)}")
        return [pd.DataFrame(), go.Figure()]


def get_logo_html():
    """Return an inline <img> tag for the logo, or '' if the file is missing."""
    logo_path = os.path.join(os.path.dirname(__file__), "jenesys.jpg")
    if not os.path.exists(logo_path):
        # Degrade gracefully instead of crashing when the asset isn't bundled.
        return ""
    with open(logo_path, "rb") as f:
        encoded_logo = base64.b64encode(f.read()).decode()
    return f'<img src="data:image/jpeg;base64,{encoded_logo}" style="height: 50px; margin-right: 10px;">'
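
# Inlining the logo as a base64 data URI keeps the app self-contained, so no
# separate static-file route is needed wherever the app is deployed.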


with gr.Blocks(title="AI Bookkeeper Leaderboard") as demo:
    gr.Markdown(f"""
    <div style="display: flex; align-items: center; margin-bottom: 1rem;">
        {get_logo_html()}
        <h1 style="margin: 0;">AI Bookkeeper Leaderboard</h1>
    </div>
    """)

    gr.Markdown(f"Last updated: {datetime.now().strftime('%Y-%m-%d')}")

    gr.Markdown("""
    ## About the Benchmark

    This benchmark evaluates large vision-language models on their ability to process and understand bookkeeping documents across four main categories:

    1. **Document Understanding (25%)**: Ability to parse and understand document structure
    2. **Data Extraction (25%)**: Accuracy in extracting specific data points
    3. **Bookkeeping Intelligence (25%)**: Understanding of bookkeeping concepts, calculations, and general ledger accounting
    4. **Error Handling (25%)**: Ability to detect and handle inconsistencies

    Each metric is scored from 0 to 1, where:
    - 0.90-1.00 = Excellent
    - 0.80-0.89 = Good
    - 0.70-0.79 = Acceptable
    - < 0.70 = Needs improvement
    """)

    with gr.Row():
        leaderboard = gr.DataFrame(
            create_leaderboard_df(),
            label="Overall Leaderboard",
            height=200
        )

    with gr.Row():
        with gr.Column(scale=1, min_width=1200):
            category_plot = gr.Plot(
                value=create_category_comparison()
            )

    with gr.Row():
        with gr.Column(scale=1):
            model_selector = gr.Dropdown(
                choices=[m for m in MODELS if m != "Ark II"],
                label="Select Model to Compare with Ark II",
                value="Claude-3-5-Sonnet",
                interactive=True
            )

    with gr.Row():
        with gr.Column(scale=2):
            metrics_table = gr.DataFrame(
                create_comparison_metrics_df("Claude-3-5-Sonnet"),
                label="Comparison Metrics (vs Ark II)",
                height=400
            )

    with gr.Row():
        with gr.Column(scale=1, min_width=1200):
            radar_chart = gr.Plot(value=create_combined_radar_chart())

    model_selector.change(
        fn=update_model_details,
        inputs=[model_selector],
        outputs=[metrics_table, radar_chart]
    )
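
    # update_model_details returns [DataFrame, Figure]; Gradio assigns the
    # returned values positionally to the `outputs` components listed above.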


if __name__ == "__main__":
    demo.launch(share=True)