File size: 3,449 Bytes
15188ef
 
91d4a22
 
 
15188ef
126a4c3
d715186
126a4c3
47a8f30
 
 
 
15188ef
91d4a22
 
 
fb0ec35
 
91d4a22
64830f2
91d4a22
15188ef
91d4a22
15188ef
91d4a22
 
 
 
 
15188ef
91d4a22
 
15188ef
91d4a22
 
 
 
15188ef
91d4a22
a1e5920
15188ef
 
 
 
0f936f8
912ca3f
e697cf2
91d4a22
15188ef
f7f1bb2
15188ef
 
91d4a22
2d11dba
 
15188ef
2d11dba
 
 
 
15188ef
2d11dba
3fee132
 
2d11dba
 
a7cf972
e72ba15
fa30761
68618b2
649d64d
e72ba15
649d64d
91d4a22
e91ebe6
af42bd2
e91ebe6
a1e5920
e91ebe6
af42bd2
e91ebe6
a1e5920
91d4a22
 
649d64d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import gradio as gr
import pandas as pd
import numpy as np
from collections import defaultdict
from gradio_leaderboard import Leaderboard, SelectColumns

# Read the per-row run/success counts that feed the pass@k metrics
df = pd.read_csv('results.csv')
duo_df = pd.read_csv('results_duo.csv')

# Normalize the 'Model' and 'Scenario' columns of the main table to strings
df = df.astype({'Model': str, 'Scenario': str})

# Function to estimate pass@k
def estimate_pass_at_k(num_samples, num_correct, k):
    """Estimate pass@k for each (samples, correct) pair.

    Args:
        num_samples: Iterable with the number of attempts per entry.
        num_correct: Iterable with the number of successful attempts per
            entry, paired element-wise with ``num_samples``.
        k: The ``k`` in pass@k.

    Returns:
        A NumPy array with one estimate per pair. An entry is NaN when it has
        fewer than ``k`` samples, and 1.0 when fewer than ``k`` of its samples
        failed.
    """
    estimates = []
    for total, passed in zip(num_samples, num_correct):
        if total < k:
            # Not enough attempts to draw k samples from.
            estimates.append(np.nan)
        elif total - passed < k:
            # Fewer than k failures: any k-subset contains a success.
            estimates.append(1.0)
        else:
            # 1 - C(total - passed, k) / C(total, k), as a running product.
            failure_terms = 1.0 - k / np.arange(total - passed + 1, total + 1)
            estimates.append(1.0 - np.prod(failure_terms))
    return np.array(estimates)

# Function to calculate pass@k
def calculate_pass_at_k(df, model, scenario, k_values=(1, 5, 10)):
    """Compute the mean pass@k of one model on one scenario.

    Args:
        df: DataFrame with 'Model', 'Scenario', 'Runs' and 'Successes' columns.
        model: Value matched against the 'Model' column.
        scenario: Value matched against the 'Scenario' column.
        k_values: Iterable of ``k`` values to evaluate. The default is an
            immutable tuple so it cannot be shared and mutated across calls.

    Returns:
        A dict mapping ``"pass@{k}"`` to the mean estimate over the matching
        rows. Every value is NaN when no row matches the model/scenario pair.
    """
    selected = df[(df['Model'] == model) & (df['Scenario'] == scenario)]

    if selected.empty:
        # Averaging an empty array also yields NaN, but emits a NumPy
        # "Mean of empty slice" RuntimeWarning; return the NaNs explicitly.
        return {f"pass@{k}": np.nan for k in k_values}

    num_samples = selected['Runs'].values
    num_correct = selected['Successes'].values

    return {
        f"pass@{k}": estimate_pass_at_k(num_samples, num_correct, k).mean()
        for k in k_values
    }

# Function to filter data and calculate pass@k
def filter_data(model, scenario):
    """Return a one-row DataFrame of pass@k metrics for a model/scenario pair.

    The metrics are computed from the module-level ``df`` via
    ``calculate_pass_at_k`` using its default ``k`` values.
    """
    return pd.DataFrame(data=[calculate_pass_at_k(df, model, scenario)])

# Initialize the leaderboard
def init_leaderboard(dataframe, height=800):
    """Build a read-only ``Leaderboard`` component from a DataFrame.

    Args:
        dataframe: Table to display; it is searchable by its "Model" column.
        height: Height passed through to the component.

    Returns:
        The constructed ``Leaderboard`` instance.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or has no rows.
    """
    has_rows = dataframe is not None and not dataframe.empty
    if not has_rows:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    leaderboard_options = {
        "value": dataframe,
        # Types of the displayed columns
        "datatype": ["markdown", "number", "number", "number"],
        "search_columns": ["Model"],
        "hide_columns": [],  # nothing hidden
        "filter_columns": [],  # no column filters
        "interactive": False,
        "height": height,
    }
    return Leaderboard(**leaderboard_options)

# Gradio interface
#models = df['Model'].unique().tolist()
#scenarios = df['Scenario'].unique().tolist()

# Per-model mean pass@1 tables that back the two leaderboards
def _mean_pass_at_1(group):
    """Reduce one model's 'Runs'/'Successes' rows to a Series with its mean pass@1."""
    scores = estimate_pass_at_k(group['Runs'].values, group['Successes'].values, 1)
    return pd.Series({'pass@1': scores.mean()}, index=['pass@1'])


duo_complete_pass_at_k = (
    duo_df.groupby('Model')[['Runs', 'Successes']]
    .apply(_mean_pass_at_1)
    .reset_index()
)

complete_pass_at_k = (
    df.groupby('Model')[['Runs', 'Successes']]
    .apply(_mean_pass_at_1)
    .reset_index()
)
    
# Header links rendered under the page title, separated by single spaces
header_links = " ".join([
    "## [Discord](https://discord.gg/3qpAbWC7)",
    "[Papers](https://huggingface.co/onekq)",
    "[Blog](https://huggingface.co/blog/onekq/all-llms-write-great-code)",
    "[Github](https://github.com/onekq/WebApp1k)",
    "[AI Models](https://www.aimodels.fyi/papers/arxiv/webapp1k-practical-code-generation-benchmark-web-app)",
])
duo_heading = "## WebApp1K-Duo ([Benchmark](https://huggingface.co/datasets/onekq-ai/WebApp1K-Duo-React))"
single_heading = "## WebApp1K ([Benchmark](https://huggingface.co/datasets/onekq-ai/WebApp1K-React))"

with gr.Blocks() as demo:
    gr.Markdown("# 🏆 WebApp1K Models Leaderboard")
    gr.Markdown(header_links)

    # Two equally weighted columns: Duo results on the left, WebApp1K on the right
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(duo_heading)
            duo_leaderboard = init_leaderboard(duo_complete_pass_at_k, height=800)

        with gr.Column(scale=1):
            gr.Markdown(single_heading)
            leaderboard = init_leaderboard(complete_pass_at_k, height=800)

# Launch the Gradio interface
demo.launch()