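"""Gradio app for the leaderboard Space.

Pulls cached evaluation results from the results dataset repo, renders the
leaderboard and About tabs, and exposes a "Test Model" tab that runs an
on-demand perplexity evaluation via run_dynamic_perplexity_eval.
"""
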
import gradio as gr
from gradio_leaderboard import Leaderboard
import pandas as pd
from huggingface_hub import snapshot_download

from src.about import (
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    AutoEvalColumn,
)
from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN
from src.populate import get_leaderboard_df
from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval

def init_leaderboard(dataframe):
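    """Build the Leaderboard component from the results DataFrame.

    Raises if the DataFrame is missing or empty so a broken results fetch
    fails loudly at startup instead of rendering an empty table.
    """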
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    
    return Leaderboard(
        dataframe,
        headers=COLS,
        column_config={
            AutoEvalColumn.model.name: "markdown",
        },
    )

def run_perplexity_test(model_name, revision, precision):
    """Run perplexity evaluation on demand."""
    if not model_name:
        return "Please enter a model name."
    
    success, result = run_dynamic_perplexity_eval(model_name, revision, precision)
    
    if success:
        return f"βœ… Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\nResults have been saved and will appear in the leaderboard shortly."
    else:
        return f"❌ Evaluation failed: {result}"

# Initialize results directory
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception as e:
    print(f"Error initializing results: {e}")

# Get initial leaderboard data
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)

# Create the Gradio interface
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("πŸ… Leaderboard", elem_id="leaderboard-tab", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("πŸ“ About", elem_id="about-tab", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("πŸ§ͺ Test Model", elem_id="test-model-tab", id=2):
            with gr.Row():
                with gr.Column():
                    model_name = gr.Textbox(label="Model name", placeholder="org/model-name")
                    revision = gr.Textbox(label="Revision", placeholder="main", value="main")
                    precision = gr.Dropdown(
                        choices=["float16", "bfloat16"],
                        label="Precision",
                        value="float16"
                    )
                
                with gr.Column():
                    test_button = gr.Button("πŸš€ Run Perplexity Test", variant="primary")
                    result = gr.Markdown()
            
            test_button.click(
                run_perplexity_test,
                [model_name, revision, precision],
                result
            )

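# Enable queuing so long-running perplexity evaluations are processed
# asynchronously; each event handler may run up to 5 jobs concurrently by default.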
demo.queue(default_concurrency_limit=5).launch()