File size: 4,376 Bytes
28d348e
 
 
 
 
 
 
 
 
 
37f8510
 
28d348e
37f8510
 
28d348e
 
 
 
 
 
 
 
 
 
 
37f8510
28d348e
 
37f8510
 
28d348e
 
37f8510
28d348e
 
 
 
 
 
 
37f8510
 
 
28d348e
 
 
 
 
 
 
 
 
 
37f8510
 
 
 
 
 
 
28d348e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37f8510
 
 
 
28d348e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import ast
import json
import urllib
import urllib.request

import gradio as gr
import pandas as pd
from datasets import load_dataset
from gradio_leaderboard import Leaderboard, SelectColumns, SearchColumns, ColumnFilter

# Task metadata table: one row per (task, suite) pair.  The source JSONL has a
# list-valued "suite" column, so explode() flattens it to one suite per row.
df = pd.read_json("https://raw.githubusercontent.com/huggingface/lighteval/main/src/lighteval/tasks/tasks_table.jsonl",
                  lines=True).explode("suite").reset_index(drop=True)

# Fetch the prompt-formatting module's source text so the inspector tab can
# display the exact code of a task's prompt function without importing (and
# executing) that module.
with urllib.request.urlopen(
        "https://raw.githubusercontent.com/huggingface/lighteval/main/src/lighteval/tasks/tasks_prompt_formatting.py") as f:
    tasks_prompt_functions_raw = f.read().decode('utf-8')

# Parse outside the `with` so the HTTP connection is released as soon as the
# bytes are read, instead of staying open through the whole AST walk.
tree = ast.parse(tasks_prompt_functions_raw)

# Map of function name -> exact source text, for every function defined in the
# fetched module (ast.walk visits nested defs too).
tasks_prompt_functions = {}
for node in ast.walk(tree):
    if isinstance(node, ast.FunctionDef):
        tasks_prompt_functions[node.name] = ast.get_source_segment(
            tasks_prompt_functions_raw, node)


def load_task_metadata(task_id):
    """Return the inspector widgets' contents for the task at row *task_id*.

    Returns a 3-tuple matching the event wiring's three outputs:
    (task definition dict, dataset-header markdown, prompt-function source).
    The original returned a stray 4th value ("unknown") that had no
    corresponding output component.
    """
    task_row = df.iloc[task_id]
    return (task_row.to_dict(),
            f"""Examples from the HF repository ([{task_row['hf_repo']}](https://huggingface.co/datasets/{task_row['hf_repo']}))""",
            # May be None when the prompt function is not defined in
            # tasks_prompt_formatting.py.
            tasks_prompt_functions.get(task_row["prompt_function"]))


def load_task_examples(task_id):
    """Fetch up to 20 evaluation examples for the task at row *task_id*.

    Streams the task's HF dataset (no full download) and returns one batch as
    a DataFrame.  Nested dict/list cell values are JSON-encoded so they render
    as text in the gr.Dataframe component.
    """
    task_row = df.iloc[task_id]
    dataset = load_dataset(task_row["hf_repo"], task_row["hf_subset"], split="+".join(task_row["evaluation_splits"]),
                           trust_remote_code=task_row["trust_dataset"], streaming=True)

    # .iter(20) yields column-major batches: {column_name: [values...]}.
    sample_data = next(dataset.iter(20))

    # Convert the column-major batch to row dicts.  NOTE: the original applied
    # the isinstance test to the whole row tuple (always false), so nested
    # dict/list values were never serialized; test each cell value instead.
    return pd.DataFrame(
        {column: json.dumps(value) if isinstance(value, (dict, list)) else value
         for column, value in zip(sample_data, row)}
        for row in zip(*sample_data.values()))


# UI layout: two tabs — a searchable task table and a per-task inspector.
with gr.Blocks() as demo:
    gr.Markdown("""
    # LightEval Tasks Explorer
    """)
    with gr.Tabs() as tabs:
        # Tab 1: full task table with column selection, search and suite filter.
        with gr.TabItem("🗃️ Tasks List"):
            Leaderboard(
                value=df,
                select_columns=SelectColumns(
                    default_selection=["name", "suite", "prompt_function", "hf_repo", "hf_subset", "evaluation_splits",
                                       "metric"],
                    cant_deselect=["name", "suite"],
                    label="Columns to display"),
                search_columns=SearchColumns(primary_column="name",
                                             secondary_columns=["suite", "prompt_function", "hf_repo", "metric"],
                                             placeholder="Search for a task by name, suite, prompt_function, hf_repo or "
                                                         "metric. To search by suite, for example, type 'suite:<query>'. Separate queries by \";\"",
                                             label="Search"),
                filter_columns=[
                    ColumnFilter("suite", type="dropdown", label="Select suite"),
                    # ColumnFilter("prompt_function", type="dropdown", label="Select prompt_function"),
                    # ColumnFilter("metric", type="dropdown", label="Select metric")
                ],
                wrap=True
            )
        # Tab 2: inspector for a single task — metadata, prompt-function
        # source, and a sample of dataset rows.
        with gr.TabItem("🔎 Task Inspector"):
            # Dropdown choices are ("suite|name", row_index) pairs, so the
            # selected value is the DataFrame row index passed to the loaders.
            task_inspector_selector = gr.Dropdown(
                choices=sorted(zip((df['suite'] + '|' + df['name']).tolist(), range(len(df)))),
                label="Task",
                info="Select a task"
            )
            with gr.Row():
                with gr.Column():
                    task_metadata = gr.Json(label="Task definition")
                with gr.Column():
                    task_prompt_function = gr.Code(label="Task prompt function", language="python", interactive=False)
            task_dataset_header = gr.Markdown("Examples from the HF repository")
            task_dataset = gr.Dataframe(wrap=True)

            # Two separate listeners on the same trigger: metadata updates
            # immediately; the (slower, network-bound) example fetch fills the
            # dataframe independently.
            gr.on(triggers=[task_inspector_selector.change], inputs=[task_inspector_selector],
                  outputs=[task_metadata, task_dataset_header, task_prompt_function], fn=load_task_metadata)
            gr.on(triggers=[task_inspector_selector.change], inputs=[task_inspector_selector], outputs=[task_dataset],
                  fn=load_task_examples)

if __name__ == "__main__":
    demo.launch()