Spaces:
Running
Running
Commit
·
d86ca68
1
Parent(s):
acb30f3
[MODIFY] Added support for other frameworks in submit, evaluation queue and harness results display
Browse files
- medic-harness-requests/.gitattributes +0 -58
- medic-harness-results/.gitattributes +0 -58
- medic-harness-results/aaditya/Llama3-OpenBioLLM-70B/results_2024-07-24T15:26:36Z.json +0 -37
- medic-harness-results/meta-llama/Llama-3.1-8B-Instruct/results_2024-07-24T15:26:36Z.json +0 -39
- src/display/utils.py +2 -2
- src/leaderboard/read_evals.py +7 -5
- src/populate.py +18 -4
- src/submission/submit.py +6 -1
medic-harness-requests/.gitattributes
DELETED
@@ -1,58 +0,0 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.lz4 filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
26 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
27 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
36 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
37 |
-
# Audio files - uncompressed
|
38 |
-
*.pcm filter=lfs diff=lfs merge=lfs -text
|
39 |
-
*.sam filter=lfs diff=lfs merge=lfs -text
|
40 |
-
*.raw filter=lfs diff=lfs merge=lfs -text
|
41 |
-
# Audio files - compressed
|
42 |
-
*.aac filter=lfs diff=lfs merge=lfs -text
|
43 |
-
*.flac filter=lfs diff=lfs merge=lfs -text
|
44 |
-
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
45 |
-
*.ogg filter=lfs diff=lfs merge=lfs -text
|
46 |
-
*.wav filter=lfs diff=lfs merge=lfs -text
|
47 |
-
# Image files - uncompressed
|
48 |
-
*.bmp filter=lfs diff=lfs merge=lfs -text
|
49 |
-
*.gif filter=lfs diff=lfs merge=lfs -text
|
50 |
-
*.png filter=lfs diff=lfs merge=lfs -text
|
51 |
-
*.tiff filter=lfs diff=lfs merge=lfs -text
|
52 |
-
# Image files - compressed
|
53 |
-
*.jpg filter=lfs diff=lfs merge=lfs -text
|
54 |
-
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
55 |
-
*.webp filter=lfs diff=lfs merge=lfs -text
|
56 |
-
# Video files - compressed
|
57 |
-
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
58 |
-
*.webm filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
medic-harness-results/.gitattributes
DELETED
@@ -1,58 +0,0 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.lz4 filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
26 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
27 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
36 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
37 |
-
# Audio files - uncompressed
|
38 |
-
*.pcm filter=lfs diff=lfs merge=lfs -text
|
39 |
-
*.sam filter=lfs diff=lfs merge=lfs -text
|
40 |
-
*.raw filter=lfs diff=lfs merge=lfs -text
|
41 |
-
# Audio files - compressed
|
42 |
-
*.aac filter=lfs diff=lfs merge=lfs -text
|
43 |
-
*.flac filter=lfs diff=lfs merge=lfs -text
|
44 |
-
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
45 |
-
*.ogg filter=lfs diff=lfs merge=lfs -text
|
46 |
-
*.wav filter=lfs diff=lfs merge=lfs -text
|
47 |
-
# Image files - uncompressed
|
48 |
-
*.bmp filter=lfs diff=lfs merge=lfs -text
|
49 |
-
*.gif filter=lfs diff=lfs merge=lfs -text
|
50 |
-
*.png filter=lfs diff=lfs merge=lfs -text
|
51 |
-
*.tiff filter=lfs diff=lfs merge=lfs -text
|
52 |
-
# Image files - compressed
|
53 |
-
*.jpg filter=lfs diff=lfs merge=lfs -text
|
54 |
-
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
55 |
-
*.webp filter=lfs diff=lfs merge=lfs -text
|
56 |
-
# Video files - compressed
|
57 |
-
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
58 |
-
*.webm filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
medic-harness-results/aaditya/Llama3-OpenBioLLM-70B/results_2024-07-24T15:26:36Z.json
DELETED
@@ -1,37 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config": {
|
3 |
-
"model_name": "aaditya/Llama3-OpenBioLLM-70B",
|
4 |
-
"revision": "main",
|
5 |
-
"submitted_time": "2024-07-24 14:33:56+00:00",
|
6 |
-
"model_type": "domain-specific",
|
7 |
-
"num_params": 70000000000,
|
8 |
-
"private": false,
|
9 |
-
"evaluated_time": "2024-07-24T15:26:36Z"
|
10 |
-
},
|
11 |
-
"results": {
|
12 |
-
"MMLU": {
|
13 |
-
"accuracy": 90.4
|
14 |
-
},
|
15 |
-
"MMLU-Pro": {
|
16 |
-
"accuracy": 64.2
|
17 |
-
},
|
18 |
-
"MedMCQA": {
|
19 |
-
"accuracy": 73.2
|
20 |
-
},
|
21 |
-
"MedQA": {
|
22 |
-
"accuracy": 76.9
|
23 |
-
},
|
24 |
-
"USMLE": {
|
25 |
-
"accuracy": 79.0
|
26 |
-
},
|
27 |
-
"PubMedQA": {
|
28 |
-
"accuracy": 73.2
|
29 |
-
},
|
30 |
-
"ToxiGen": {
|
31 |
-
"accuracy": 91.3
|
32 |
-
},
|
33 |
-
"Average": {
|
34 |
-
"accuracy": 78.3
|
35 |
-
}
|
36 |
-
}
|
37 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
medic-harness-results/meta-llama/Llama-3.1-8B-Instruct/results_2024-07-24T15:26:36Z.json
DELETED
@@ -1,39 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config": {
|
3 |
-
"model_name": "meta-llama/Llama-3.1-8B-Instruct",
|
4 |
-
"revision": "main",
|
5 |
-
"submitted_time": "2024-07-24 14:33:56+00:00",
|
6 |
-
"model_type": "instruction-tuned",
|
7 |
-
"num_params": 8000000000,
|
8 |
-
"private": false,
|
9 |
-
"evaluated_time": "2024-07-24T15:26:36Z"
|
10 |
-
},
|
11 |
-
"results": {
|
12 |
-
"MMLU": {
|
13 |
-
"accuracy": 73.4
|
14 |
-
},
|
15 |
-
"MMLU-Pro": {
|
16 |
-
"accuracy": 49.9
|
17 |
-
},
|
18 |
-
"MedMCQA": {
|
19 |
-
"accuracy": 58.4
|
20 |
-
},
|
21 |
-
"MedQA": {
|
22 |
-
"accuracy": 62.0
|
23 |
-
},
|
24 |
-
"USMLE": {
|
25 |
-
"accuracy": 68.2
|
26 |
-
},
|
27 |
-
"PubMedQA": {
|
28 |
-
"accuracy": 76.2
|
29 |
-
},
|
30 |
-
"ToxiGen": {
|
31 |
-
"accuracy": 82.3
|
32 |
-
},
|
33 |
-
"Average": {
|
34 |
-
"accuracy": 67.2
|
35 |
-
}
|
36 |
-
}
|
37 |
-
}
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/display/utils.py
CHANGED
@@ -61,8 +61,8 @@ class EvalQueueColumn: # Queue column
|
|
61 |
model_type = ColumnContent("model_type", "str", True)
|
62 |
precision = ColumnContent("precision", "str", True)
|
63 |
weight_type = ColumnContent("weight_type", "str", "Original")
|
64 |
-
|
65 |
-
|
66 |
|
67 |
## All the model information that we might need
|
68 |
@dataclass
|
|
|
61 |
model_type = ColumnContent("model_type", "str", True)
|
62 |
precision = ColumnContent("precision", "str", True)
|
63 |
weight_type = ColumnContent("weight_type", "str", "Original")
|
64 |
+
closed_ended_status = ColumnContent("closed_ended_status", "str", True)
|
65 |
+
open_ended_status = ColumnContent("open_ended_status", "str", True)
|
66 |
|
67 |
## All the model information that we might need
|
68 |
@dataclass
|
src/leaderboard/read_evals.py
CHANGED
@@ -76,18 +76,20 @@ class EvalResult:
|
|
76 |
backbone = ";".join(backbones)
|
77 |
|
78 |
# Extract results available in this file (some results are split in several files)
|
79 |
-
|
80 |
for task in HarnessTasks:
|
81 |
task = task.value
|
82 |
|
83 |
# We average all scores of a given metric (not all metrics are present in all files)
|
84 |
-
|
|
|
|
|
|
|
85 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
86 |
continue
|
87 |
|
88 |
mean_acc = np.mean(accs) # * 100.0
|
89 |
-
|
90 |
-
print(dataset_results)
|
91 |
# types_results = {}
|
92 |
# for clinical_type in ClinicalTypes:
|
93 |
# clinical_type = clinical_type.value
|
@@ -106,7 +108,7 @@ class EvalResult:
|
|
106 |
org=org,
|
107 |
model=model,
|
108 |
revision=config.get("revision", ""),
|
109 |
-
dataset_results=
|
110 |
is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
|
111 |
use_chat_template=config.get("use_chat_template", False), # Assuming a default value
|
112 |
precision=precision,
|
|
|
76 |
backbone = ";".join(backbones)
|
77 |
|
78 |
# Extract results available in this file (some results are split in several files)
|
79 |
+
harness_results = {}
|
80 |
for task in HarnessTasks:
|
81 |
task = task.value
|
82 |
|
83 |
# We average all scores of a given metric (not all metrics are present in all files)
|
84 |
+
try:
|
85 |
+
accs = np.array([v.get(task.metric, None) for k, v in data["results"]["closed-ended"].items() if task.benchmark == k])
|
86 |
+
except:
|
87 |
+
breakpoint()
|
88 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
89 |
continue
|
90 |
|
91 |
mean_acc = np.mean(accs) # * 100.0
|
92 |
+
harness_results[task.benchmark] = mean_acc
|
|
|
93 |
# types_results = {}
|
94 |
# for clinical_type in ClinicalTypes:
|
95 |
# clinical_type = clinical_type.value
|
|
|
108 |
org=org,
|
109 |
model=model,
|
110 |
revision=config.get("revision", ""),
|
111 |
+
dataset_results=harness_results,
|
112 |
is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
|
113 |
use_chat_template=config.get("use_chat_template", False), # Assuming a default value
|
114 |
precision=precision,
|
src/populate.py
CHANGED
@@ -36,6 +36,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
36 |
data = json.load(fp)
|
37 |
data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
|
38 |
data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
|
|
|
|
39 |
all_evals.append(data)
|
40 |
elif ".md" not in entry:
|
41 |
# this is a folder
|
@@ -47,11 +49,23 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
47 |
# print(data)
|
48 |
data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
|
49 |
data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
|
|
|
|
50 |
all_evals.append(data)
|
51 |
-
|
52 |
-
pending_list = [
|
53 |
-
running_list = [
|
54 |
-
finished_list = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
|
56 |
df_running = pd.DataFrame.from_records(running_list, columns=cols)
|
57 |
df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
|
|
|
36 |
data = json.load(fp)
|
37 |
data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
|
38 |
data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
39 |
+
data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
|
40 |
+
data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
|
41 |
all_evals.append(data)
|
42 |
elif ".md" not in entry:
|
43 |
# this is a folder
|
|
|
49 |
# print(data)
|
50 |
data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
|
51 |
data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
52 |
+
data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
|
53 |
+
data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
|
54 |
all_evals.append(data)
|
55 |
+
# breakpoint()
|
56 |
+
pending_list = []
|
57 |
+
running_list = []
|
58 |
+
finished_list = []
|
59 |
+
for run in all_evals:
|
60 |
+
status_list = [run["status"]["closed-ended"], run["status"]["open-ended"], run["status"]["med-safety"], run["status"]["cross-examination"]]
|
61 |
+
status_list = status_list[:2]
|
62 |
+
if "RUNNING" in status_list:
|
63 |
+
running_list.append(run)
|
64 |
+
elif "PENDING" in status_list or "RERUN" in status_list:
|
65 |
+
pending_list.append(run)
|
66 |
+
else:
|
67 |
+
finished_list.append(run)
|
68 |
+
# breakpoint()
|
69 |
df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
|
70 |
df_running = pd.DataFrame.from_records(running_list, columns=cols)
|
71 |
df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
|
src/submission/submit.py
CHANGED
@@ -135,7 +135,12 @@ def add_new_eval(
|
|
135 |
"weight_type": weight_type,
|
136 |
"is_domain_specific": domain_specific,
|
137 |
"use_chat_template": chat_template,
|
138 |
-
"status":
|
|
|
|
|
|
|
|
|
|
|
139 |
"submitted_time": current_time,
|
140 |
"model_type": model_type,
|
141 |
"likes": model_info.likes,
|
|
|
135 |
"weight_type": weight_type,
|
136 |
"is_domain_specific": domain_specific,
|
137 |
"use_chat_template": chat_template,
|
138 |
+
"status": {
|
139 |
+
"closed-ended": "PENDING",
|
140 |
+
"open-ended": "PENDING",
|
141 |
+
"med-safety": "PENDING",
|
142 |
+
"cross-examination": "PENDING",
|
143 |
+
},
|
144 |
"submitted_time": current_time,
|
145 |
"model_type": model_type,
|
146 |
"likes": model_info.likes,
|