import os
import asyncio

from yourbench_space.leaderboard_space.env import INIT_MODELS


# On a Hugging Face Space, write outputs to /data; otherwise use the current directory
ON_SPACES = os.environ.get("system") == "spaces"
OUTPUT_DIR = "/data" if ON_SPACES else "."


def create_eval_file(eval_ds_name: str):
    """Write a lighteval custom task module for `eval_ds_name` into OUTPUT_DIR."""
    # TODO: replace by Nathan's call
    content = (
        """
from aenum import extend_enum

from lighteval.metrics.metrics import Metrics
from lighteval.metrics.utils.metric_utils import (
    CorpusLevelMetricGrouping,
    MetricCategory,
    MetricUseCase,
)
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.extended.hle.main import JudgeLLMHLE
from lighteval.tasks.requests import Doc


def prompt_function(line, task_name: str = None):
    if line["image"] not in [None, ""]:
        return

    return Doc(
        task_name=task_name,
        query="Question: " + line["question"] + "\\nAnswer:",
        choices=[line["answer"]],
        gold_index=0,
        specific={"question": line["question"]},
    )
"""
        + f"""

hle = LightevalTaskConfig(
    name="{eval_ds_name.replace("/", "_")}",
    suite=["custom"],
    prompt_function=prompt_function,
    hf_repo="{eval_ds_name}",
    hf_subset="default",
    hf_avail_splits=["test"],
    evaluation_splits=["test"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=8192,
    metric=[Metrics.exact_match],
    stop_sequence=[],
    trust_dataset=True,
    version=0,
)


TASKS_TABLE = [hle]
"""
    )

    with open(f"{OUTPUT_DIR}/custom_task.py", "w") as f:
        f.write(content)


async def run_process(args: list) -> dict:
    """Run a command as a subprocess and capture its output, with a 3-minute timeout."""
    process = await asyncio.create_subprocess_exec(
        *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )
    # communicate() drains stdout/stderr while waiting for exit, avoiding the
    # deadlock that wait() followed by read() can hit when the child fills the pipe buffer.
    stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=180)
    return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}


async def run_evaluations(eval_ds_name: str, org: str) -> str:
    """Run one lighteval evaluation per configured model and report overall status."""
    tasks = []
    for model_name, provider in INIT_MODELS:
        args = [
            "lighteval",
            "endpoint",
            "inference-providers",
            f"model={model_name},provider={provider}",
            f"custom|{eval_ds_name.replace('/', '_')}|0|0",
            "--custom-tasks",
            f"{OUTPUT_DIR}/custom_task.py",
            "--max-samples",
            "10",
            "--output-dir",
            f"{OUTPUT_DIR}",
            "--save-details",
            "--results-org",
            org,
            "--push-to-hub",
        ]
        tasks.append(run_process(args))
    # return_exceptions=True collects failed runs as results instead of raising
    processes = await asyncio.gather(*tasks, return_exceptions=True)
    if all(not isinstance(result, Exception) for result in processes):
        return "✅"
    return "At least one model failed"