tathagataraha committed on
Commit
d86ca68
·
1 Parent(s): acb30f3

[MODIFY] Added support for other frameworks in submit, evaluation queue and harness results display

Browse files
medic-harness-requests/.gitattributes DELETED
@@ -1,58 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.lz4 filter=lfs diff=lfs merge=lfs -text
12
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
- *.model filter=lfs diff=lfs merge=lfs -text
14
- *.msgpack filter=lfs diff=lfs merge=lfs -text
15
- *.npy filter=lfs diff=lfs merge=lfs -text
16
- *.npz filter=lfs diff=lfs merge=lfs -text
17
- *.onnx filter=lfs diff=lfs merge=lfs -text
18
- *.ot filter=lfs diff=lfs merge=lfs -text
19
- *.parquet filter=lfs diff=lfs merge=lfs -text
20
- *.pb filter=lfs diff=lfs merge=lfs -text
21
- *.pickle filter=lfs diff=lfs merge=lfs -text
22
- *.pkl filter=lfs diff=lfs merge=lfs -text
23
- *.pt filter=lfs diff=lfs merge=lfs -text
24
- *.pth filter=lfs diff=lfs merge=lfs -text
25
- *.rar filter=lfs diff=lfs merge=lfs -text
26
- *.safetensors filter=lfs diff=lfs merge=lfs -text
27
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
- *.tar.* filter=lfs diff=lfs merge=lfs -text
29
- *.tar filter=lfs diff=lfs merge=lfs -text
30
- *.tflite filter=lfs diff=lfs merge=lfs -text
31
- *.tgz filter=lfs diff=lfs merge=lfs -text
32
- *.wasm filter=lfs diff=lfs merge=lfs -text
33
- *.xz filter=lfs diff=lfs merge=lfs -text
34
- *.zip filter=lfs diff=lfs merge=lfs -text
35
- *.zst filter=lfs diff=lfs merge=lfs -text
36
- *tfevents* filter=lfs diff=lfs merge=lfs -text
37
- # Audio files - uncompressed
38
- *.pcm filter=lfs diff=lfs merge=lfs -text
39
- *.sam filter=lfs diff=lfs merge=lfs -text
40
- *.raw filter=lfs diff=lfs merge=lfs -text
41
- # Audio files - compressed
42
- *.aac filter=lfs diff=lfs merge=lfs -text
43
- *.flac filter=lfs diff=lfs merge=lfs -text
44
- *.mp3 filter=lfs diff=lfs merge=lfs -text
45
- *.ogg filter=lfs diff=lfs merge=lfs -text
46
- *.wav filter=lfs diff=lfs merge=lfs -text
47
- # Image files - uncompressed
48
- *.bmp filter=lfs diff=lfs merge=lfs -text
49
- *.gif filter=lfs diff=lfs merge=lfs -text
50
- *.png filter=lfs diff=lfs merge=lfs -text
51
- *.tiff filter=lfs diff=lfs merge=lfs -text
52
- # Image files - compressed
53
- *.jpg filter=lfs diff=lfs merge=lfs -text
54
- *.jpeg filter=lfs diff=lfs merge=lfs -text
55
- *.webp filter=lfs diff=lfs merge=lfs -text
56
- # Video files - compressed
57
- *.mp4 filter=lfs diff=lfs merge=lfs -text
58
- *.webm filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
medic-harness-results/.gitattributes DELETED
@@ -1,58 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.lz4 filter=lfs diff=lfs merge=lfs -text
12
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
- *.model filter=lfs diff=lfs merge=lfs -text
14
- *.msgpack filter=lfs diff=lfs merge=lfs -text
15
- *.npy filter=lfs diff=lfs merge=lfs -text
16
- *.npz filter=lfs diff=lfs merge=lfs -text
17
- *.onnx filter=lfs diff=lfs merge=lfs -text
18
- *.ot filter=lfs diff=lfs merge=lfs -text
19
- *.parquet filter=lfs diff=lfs merge=lfs -text
20
- *.pb filter=lfs diff=lfs merge=lfs -text
21
- *.pickle filter=lfs diff=lfs merge=lfs -text
22
- *.pkl filter=lfs diff=lfs merge=lfs -text
23
- *.pt filter=lfs diff=lfs merge=lfs -text
24
- *.pth filter=lfs diff=lfs merge=lfs -text
25
- *.rar filter=lfs diff=lfs merge=lfs -text
26
- *.safetensors filter=lfs diff=lfs merge=lfs -text
27
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
- *.tar.* filter=lfs diff=lfs merge=lfs -text
29
- *.tar filter=lfs diff=lfs merge=lfs -text
30
- *.tflite filter=lfs diff=lfs merge=lfs -text
31
- *.tgz filter=lfs diff=lfs merge=lfs -text
32
- *.wasm filter=lfs diff=lfs merge=lfs -text
33
- *.xz filter=lfs diff=lfs merge=lfs -text
34
- *.zip filter=lfs diff=lfs merge=lfs -text
35
- *.zst filter=lfs diff=lfs merge=lfs -text
36
- *tfevents* filter=lfs diff=lfs merge=lfs -text
37
- # Audio files - uncompressed
38
- *.pcm filter=lfs diff=lfs merge=lfs -text
39
- *.sam filter=lfs diff=lfs merge=lfs -text
40
- *.raw filter=lfs diff=lfs merge=lfs -text
41
- # Audio files - compressed
42
- *.aac filter=lfs diff=lfs merge=lfs -text
43
- *.flac filter=lfs diff=lfs merge=lfs -text
44
- *.mp3 filter=lfs diff=lfs merge=lfs -text
45
- *.ogg filter=lfs diff=lfs merge=lfs -text
46
- *.wav filter=lfs diff=lfs merge=lfs -text
47
- # Image files - uncompressed
48
- *.bmp filter=lfs diff=lfs merge=lfs -text
49
- *.gif filter=lfs diff=lfs merge=lfs -text
50
- *.png filter=lfs diff=lfs merge=lfs -text
51
- *.tiff filter=lfs diff=lfs merge=lfs -text
52
- # Image files - compressed
53
- *.jpg filter=lfs diff=lfs merge=lfs -text
54
- *.jpeg filter=lfs diff=lfs merge=lfs -text
55
- *.webp filter=lfs diff=lfs merge=lfs -text
56
- # Video files - compressed
57
- *.mp4 filter=lfs diff=lfs merge=lfs -text
58
- *.webm filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
medic-harness-results/aaditya/Llama3-OpenBioLLM-70B/results_2024-07-24T15:26:36Z.json DELETED
@@ -1,37 +0,0 @@
1
- {
2
- "config": {
3
- "model_name": "aaditya/Llama3-OpenBioLLM-70B",
4
- "revision": "main",
5
- "submitted_time": "2024-07-24 14:33:56+00:00",
6
- "model_type": "domain-specific",
7
- "num_params": 70000000000,
8
- "private": false,
9
- "evaluated_time": "2024-07-24T15:26:36Z"
10
- },
11
- "results": {
12
- "MMLU": {
13
- "accuracy": 90.4
14
- },
15
- "MMLU-Pro": {
16
- "accuracy": 64.2
17
- },
18
- "MedMCQA": {
19
- "accuracy": 73.2
20
- },
21
- "MedQA": {
22
- "accuracy": 76.9
23
- },
24
- "USMLE": {
25
- "accuracy": 79.0
26
- },
27
- "PubMedQA": {
28
- "accuracy": 73.2
29
- },
30
- "ToxiGen": {
31
- "accuracy": 91.3
32
- },
33
- "Average": {
34
- "accuracy": 78.3
35
- }
36
- }
37
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
medic-harness-results/meta-llama/Llama-3.1-8B-Instruct/results_2024-07-24T15:26:36Z.json DELETED
@@ -1,39 +0,0 @@
1
- {
2
- "config": {
3
- "model_name": "meta-llama/Llama-3.1-8B-Instruct",
4
- "revision": "main",
5
- "submitted_time": "2024-07-24 14:33:56+00:00",
6
- "model_type": "instruction-tuned",
7
- "num_params": 8000000000,
8
- "private": false,
9
- "evaluated_time": "2024-07-24T15:26:36Z"
10
- },
11
- "results": {
12
- "MMLU": {
13
- "accuracy": 73.4
14
- },
15
- "MMLU-Pro": {
16
- "accuracy": 49.9
17
- },
18
- "MedMCQA": {
19
- "accuracy": 58.4
20
- },
21
- "MedQA": {
22
- "accuracy": 62.0
23
- },
24
- "USMLE": {
25
- "accuracy": 68.2
26
- },
27
- "PubMedQA": {
28
- "accuracy": 76.2
29
- },
30
- "ToxiGen": {
31
- "accuracy": 82.3
32
- },
33
- "Average": {
34
- "accuracy": 67.2
35
- }
36
- }
37
- }
38
-
39
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display/utils.py CHANGED
@@ -61,8 +61,8 @@ class EvalQueueColumn: # Queue column
61
  model_type = ColumnContent("model_type", "str", True)
62
  precision = ColumnContent("precision", "str", True)
63
  weight_type = ColumnContent("weight_type", "str", "Original")
64
- status = ColumnContent("status", "str", True)
65
-
66
 
67
  ## All the model information that we might need
68
  @dataclass
 
61
  model_type = ColumnContent("model_type", "str", True)
62
  precision = ColumnContent("precision", "str", True)
63
  weight_type = ColumnContent("weight_type", "str", "Original")
64
+ closed_ended_status = ColumnContent("closed_ended_status", "str", True)
65
+ open_ended_status = ColumnContent("open_ended_status", "str", True)
66
 
67
  ## All the model information that we might need
68
  @dataclass
src/leaderboard/read_evals.py CHANGED
@@ -76,18 +76,20 @@ class EvalResult:
76
  backbone = ";".join(backbones)
77
 
78
  # Extract results available in this file (some results are split in several files)
79
- dataset_results = {}
80
  for task in HarnessTasks:
81
  task = task.value
82
 
83
  # We average all scores of a given metric (not all metrics are present in all files)
84
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
 
 
 
85
  if accs.size == 0 or any([acc is None for acc in accs]):
86
  continue
87
 
88
  mean_acc = np.mean(accs) # * 100.0
89
- dataset_results[task.benchmark] = mean_acc
90
- print(dataset_results)
91
  # types_results = {}
92
  # for clinical_type in ClinicalTypes:
93
  # clinical_type = clinical_type.value
@@ -106,7 +108,7 @@ class EvalResult:
106
  org=org,
107
  model=model,
108
  revision=config.get("revision", ""),
109
- dataset_results=dataset_results,
110
  is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
111
  use_chat_template=config.get("use_chat_template", False), # Assuming a default value
112
  precision=precision,
 
76
  backbone = ";".join(backbones)
77
 
78
  # Extract results available in this file (some results are split in several files)
79
+ harness_results = {}
80
  for task in HarnessTasks:
81
  task = task.value
82
 
83
  # We average all scores of a given metric (not all metrics are present in all files)
84
+ try:
85
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"]["closed-ended"].items() if task.benchmark == k])
86
+ except:
87
+ breakpoint()
88
  if accs.size == 0 or any([acc is None for acc in accs]):
89
  continue
90
 
91
  mean_acc = np.mean(accs) # * 100.0
92
+ harness_results[task.benchmark] = mean_acc
 
93
  # types_results = {}
94
  # for clinical_type in ClinicalTypes:
95
  # clinical_type = clinical_type.value
 
108
  org=org,
109
  model=model,
110
  revision=config.get("revision", ""),
111
+ dataset_results=harness_results,
112
  is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
113
  use_chat_template=config.get("use_chat_template", False), # Assuming a default value
114
  precision=precision,
src/populate.py CHANGED
@@ -36,6 +36,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
36
  data = json.load(fp)
37
  data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
38
  data[EvalQueueColumn.revision.name] = data.get("revision", "main")
 
 
39
  all_evals.append(data)
40
  elif ".md" not in entry:
41
  # this is a folder
@@ -47,11 +49,23 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
47
  # print(data)
48
  data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
49
  data[EvalQueueColumn.revision.name] = data.get("revision", "main")
 
 
50
  all_evals.append(data)
51
-
52
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
53
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
54
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
 
 
 
 
 
 
 
 
 
 
55
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
56
  df_running = pd.DataFrame.from_records(running_list, columns=cols)
57
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
 
36
  data = json.load(fp)
37
  data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
38
  data[EvalQueueColumn.revision.name] = data.get("revision", "main")
39
+ data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
40
+ data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
41
  all_evals.append(data)
42
  elif ".md" not in entry:
43
  # this is a folder
 
49
  # print(data)
50
  data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
51
  data[EvalQueueColumn.revision.name] = data.get("revision", "main")
52
+ data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
53
+ data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
54
  all_evals.append(data)
55
+ # breakpoint()
56
+ pending_list = []
57
+ running_list = []
58
+ finished_list = []
59
+ for run in all_evals:
60
+ status_list = [run["status"]["closed-ended"], run["status"]["open-ended"], run["status"]["med-safety"], run["status"]["cross-examination"]]
61
+ status_list = status_list[:2]
62
+ if "RUNNING" in status_list:
63
+ running_list.append(run)
64
+ elif "PENDING" in status_list or "RERUN" in status_list:
65
+ pending_list.append(run)
66
+ else:
67
+ finished_list.append(run)
68
+ # breakpoint()
69
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
70
  df_running = pd.DataFrame.from_records(running_list, columns=cols)
71
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
src/submission/submit.py CHANGED
@@ -135,7 +135,12 @@ def add_new_eval(
135
  "weight_type": weight_type,
136
  "is_domain_specific": domain_specific,
137
  "use_chat_template": chat_template,
138
- "status": "PENDING",
 
 
 
 
 
139
  "submitted_time": current_time,
140
  "model_type": model_type,
141
  "likes": model_info.likes,
 
135
  "weight_type": weight_type,
136
  "is_domain_specific": domain_specific,
137
  "use_chat_template": chat_template,
138
+ "status": {
139
+ "closed-ended": "PENDING",
140
+ "open-ended": "PENDING",
141
+ "med-safety": "PENDING",
142
+ "cross-examination": "PENDING",
143
+ },
144
  "submitted_time": current_time,
145
  "model_type": model_type,
146
  "likes": model_info.likes,