File size: 3,449 Bytes
15188ef 91d4a22 15188ef 126a4c3 d715186 126a4c3 47a8f30 15188ef 91d4a22 fb0ec35 91d4a22 64830f2 91d4a22 15188ef 91d4a22 15188ef 91d4a22 15188ef 91d4a22 15188ef 91d4a22 15188ef 91d4a22 a1e5920 15188ef 0f936f8 912ca3f e697cf2 91d4a22 15188ef f7f1bb2 15188ef 91d4a22 2d11dba 15188ef 2d11dba 15188ef 2d11dba 3fee132 2d11dba a7cf972 e72ba15 fa30761 68618b2 649d64d e72ba15 649d64d 91d4a22 e91ebe6 af42bd2 e91ebe6 a1e5920 e91ebe6 af42bd2 e91ebe6 a1e5920 91d4a22 649d64d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import gradio as gr
import pandas as pd
import numpy as np
from collections import defaultdict
from gradio_leaderboard import Leaderboard, SelectColumns
# Read the raw per-row results backing the two leaderboards: 'results.csv'
# feeds the WebApp1K table and 'results_duo.csv' the WebApp1K-Duo table.
df = pd.read_csv('results.csv')
duo_df = pd.read_csv('results_duo.csv')

# Force the two key columns of the WebApp1K table to str in a single cast so
# later equality filters on model/scenario names compare strings to strings.
df = df.astype({'Model': str, 'Scenario': str})
def estimate_pass_at_k(num_samples, num_correct, k):
    """Estimate pass@k for each (samples, correct) pair.

    Args:
        num_samples: Iterable with the number of attempts per problem.
        num_correct: Iterable with the number of successful attempts per
            problem, paired element-wise with ``num_samples``.
        k: The ``k`` in pass@k.

    Returns:
        A NumPy array with one estimate per pair:
        ``nan`` when fewer than ``k`` attempts exist, ``1.0`` when fewer
        than ``k`` attempts failed, and otherwise
        ``1 - prod(1 - k / i)`` for ``i`` in ``n - c + 1 .. n``
        (algebraically ``1 - C(n - c, k) / C(n, k)``).
    """
    estimates = []
    for total, passed in zip(num_samples, num_correct):
        failed = total - passed
        if total < k:
            # Not enough attempts to draw k of them: undefined.
            value = np.nan
        elif failed < k:
            # Any k attempts must include at least one success.
            value = 1.0
        else:
            # Product form avoids computing large binomial coefficients.
            value = 1.0 - np.prod(1.0 - k / np.arange(failed + 1, total + 1))
        estimates.append(value)
    return np.array(estimates)
def calculate_pass_at_k(df, model, scenario, k_values=(1, 5, 10)):
    """Compute mean pass@k for one model/scenario pair.

    Args:
        df: DataFrame with 'Model', 'Scenario', 'Runs' and 'Successes'
            columns.
        model: Value matched against the 'Model' column.
        scenario: Value matched against the 'Scenario' column.
        k_values: Iterable of ``k`` values to evaluate. The default is an
            immutable tuple so it cannot be mutated and shared across calls.

    Returns:
        A dict mapping ``"pass@<k>"`` to the mean of the per-row estimates
        returned by ``estimate_pass_at_k`` for the matching rows.
    """
    # Keep only the rows belonging to the requested model and scenario.
    selected = df[(df['Model'] == model) & (df['Scenario'] == scenario)]
    num_samples = selected['Runs'].values
    num_correct = selected['Successes'].values
    return {
        f"pass@{k}": estimate_pass_at_k(num_samples, num_correct, k).mean()
        for k in k_values
    }
def filter_data(model, scenario):
    """Return a one-row DataFrame of pass@k metrics for a model/scenario.

    Delegates to ``calculate_pass_at_k`` on the module-level ``df`` and wraps
    the resulting metrics dict as the single row of a new DataFrame, so each
    ``pass@k`` key becomes a column.
    """
    metrics = calculate_pass_at_k(df, model, scenario)
    rows = [metrics]
    return pd.DataFrame(rows)
def init_leaderboard(dataframe, height=800):
    """Build a read-only ``Leaderboard`` component from a DataFrame.

    Args:
        dataframe: Table to display; must be a non-empty DataFrame.
        height: Height passed through to the component.

    Returns:
        The constructed ``Leaderboard`` component.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or has no data.
    """
    has_rows = dataframe is not None and not dataframe.empty
    if not has_rows:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    options = dict(
        value=dataframe,
        # Column types, in column order.
        # NOTE(review): four entries are listed here while the frames built
        # in this file carry two columns (Model, pass@1) - confirm the extra
        # entries are ignored by the component.
        datatype=["markdown", "number", "number", "number"],
        search_columns=["Model"],
        hide_columns=[],  # nothing hidden
        filter_columns=[],  # no column filters
        interactive=False,
        height=height,
    )
    return Leaderboard(**options)
# Per-model pass@1 tables that feed the two leaderboards.
def _pass_at_1_by_model(results):
    """Return a DataFrame with one row per model and its mean pass@1.

    Groups ``results`` by 'Model', runs ``estimate_pass_at_k`` with ``k=1``
    over each group's 'Runs' and 'Successes' values, and averages the
    per-row estimates. The output has the columns 'Model' and 'pass@1'.
    """
    return results.groupby('Model')[['Runs', 'Successes']].apply(
        lambda group: pd.Series(
            {
                'pass@1': estimate_pass_at_k(
                    group['Runs'].values, group['Successes'].values, 1
                ).mean()
            },
            index=['pass@1'],
        )
    ).reset_index()


# Both benchmarks are summarised by the same helper, so the two tables are
# always computed identically.
duo_complete_pass_at_k = _pass_at_1_by_model(duo_df)
complete_pass_at_k = _pass_at_1_by_model(df)
# Page layout: title, a bar of external links, then the two leaderboards
# side by side in equally weighted columns.
with gr.Blocks() as demo:
    gr.Markdown("# 🏆 WebApp1K Models Leaderboard")
    # Adjacent string literals are joined at compile time into a single
    # Markdown heading; using one concatenation style throughout avoids the
    # earlier mix of explicit `+` and implicit joining.
    gr.Markdown(
        "## [Discord](https://discord.gg/3qpAbWC7) "
        "[Papers](https://huggingface.co/onekq) "
        "[Blog](https://huggingface.co/blog/onekq/all-llms-write-great-code) "
        "[Github](https://github.com/onekq/WebApp1k) "
        "[AI Models](https://www.aimodels.fyi/papers/arxiv/webapp1k-practical-code-generation-benchmark-web-app)"
    )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## WebApp1K-Duo ([Benchmark](https://huggingface.co/datasets/onekq-ai/WebApp1K-Duo-React))")
            duo_leaderboard = init_leaderboard(duo_complete_pass_at_k, height=800)
        with gr.Column(scale=1):
            gr.Markdown("## WebApp1K ([Benchmark](https://huggingface.co/datasets/onekq-ai/WebApp1K-React))")
            leaderboard = init_leaderboard(complete_pass_at_k, height=800)

# Launch the Gradio interface
demo.launch()