Spaces:
Running
Running
updated scores algorithm
Browse files- utils_v2.py +39 -32
utils_v2.py
CHANGED
|
@@ -66,17 +66,9 @@ def load_data(base_dir=SCORE_BASE_DIR):
|
|
| 66 |
all_data.append(data)
|
| 67 |
return all_data
|
| 68 |
|
| 69 |
-
def
|
| 70 |
-
"""This function
|
| 71 |
-
|
| 72 |
-
"""
|
| 73 |
-
def get_avg(sum_score, leng):
|
| 74 |
-
avg = sum_score / leng if leng > 0 else 0.0
|
| 75 |
-
avg = round(avg, 2) # Round to 2 decimal places
|
| 76 |
-
return avg
|
| 77 |
-
|
| 78 |
-
avg_scores = {}
|
| 79 |
-
overall_scores_summary = {} # Stores the scores sum and length for each modality and all datasets
|
| 80 |
for modality, datasets_list in DATASETS.items(): # Ex.: ('image', {'I-CLS': [...], 'I-QA': [...]})
|
| 81 |
overall_scores_summary[modality] = (0.0, 0) # Initialize the sum and count for each modality
|
| 82 |
for sub_task, datasets in datasets_list.items(): # Ex.: ('I-CLS', ['VOC2007', 'N24News', ...])
|
|
@@ -87,26 +79,42 @@ def calculate_score(raw_scores=None):
|
|
| 87 |
metric = SPECIAL_METRICS.get(dataset, 'hit@1')
|
| 88 |
if isinstance(score, dict):
|
| 89 |
score = score.get(metric, 0.0)
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
def generate_model_row(data):
|
| 112 |
metadata = data['metadata']
|
|
@@ -127,7 +135,6 @@ def get_df():
|
|
| 127 |
df = df.sort_values(by='Overall', ascending=False).reset_index(drop=True)
|
| 128 |
df['Rank'] = range(1, len(df) + 1)
|
| 129 |
df = create_hyperlinked_names(df)
|
| 130 |
-
|
| 131 |
return df
|
| 132 |
|
| 133 |
def refresh_data():
|
|
|
|
| 66 |
all_data.append(data)
|
| 67 |
return all_data
|
| 68 |
|
| 69 |
+
def load_scores(raw_scores=None):
    """Flatten the user-provided raw scores into one {dataset_name: score} dict.

    Walks DATASETS (modality -> sub-task -> dataset names) and resolves each
    dataset's score, unwrapping per-dataset metric dicts via SPECIAL_METRICS
    (default metric: 'hit@1').

    Returns:
        dict mapping each dataset name to a float score.
    """
    all_scores = {}
    for modality, datasets_list in DATASETS.items():  # Ex.: ('image', {'I-CLS': [...], 'I-QA': [...]})
        for sub_task, datasets in datasets_list.items():  # Ex.: ('I-CLS', ['VOC2007', 'N24News', ...])
            for dataset in datasets:
                # NOTE(review): the original score lookup (file lines 75-78) is
                # hidden by the diff hunk context — confirm this retrieval
                # against the full file before merging.
                score = (raw_scores or {}).get(dataset, 0.0)
                metric = SPECIAL_METRICS.get(dataset, 'hit@1')
                if isinstance(score, dict):
                    score = score.get(metric, 0.0)
                # BUG FIX: the original did
                #   all_scores.update({'dataset': dataset, 'score': score})
                # which stores only the literal keys 'dataset'/'score' and
                # overwrites them on every iteration. calculate_score() reads
                # all_scores.get(dataset, 0.0), so the map must be keyed by
                # dataset name.
                all_scores[dataset] = score
    # BUG FIX: dropped the stray `overall_scores_summary[modality] = (0.0, 0)`
    # line — its initialization was removed in this change, so it raised
    # NameError, and nothing in this function read it.
    return all_scores
|
| 85 |
+
|
| 86 |
+
def calculate_score(raw_scores=None):
    """Compute the overall, per-modality, and per-sub-task average scores.

    Starts from the flat {dataset: score} mapping produced by load_scores()
    and augments it with:
      * 'Overall'                    — average over ALL_DATASETS,
      * '<Modality>-Overall'         — average per modality (MODALITIES),
      * '<sub_task>'                 — average per sub-task (DATASETS).
    All averages are rounded to 2 decimal places.

    Returns:
        dict with the dataset scores plus the aggregate entries above.
    """
    def get_avg(sum_score, leng):
        # Guard against division by zero, then round for display.
        avg = sum_score / leng if leng > 0 else 0.0
        return round(avg, 2)  # Round to 2 decimal places

    all_scores = load_scores(raw_scores)
    avg_scores = {}

    # Overall average across all datasets. Iterate ALL_DATASETS explicitly
    # (rather than sum(all_scores.values())) so the numerator matches the
    # len(ALL_DATASETS) denominator, stays consistent with the per-modality
    # computation below, and is unaffected by any non-dataset key that may
    # end up in all_scores; missing datasets count as 0.0.
    avg_scores['Overall'] = get_avg(
        sum(all_scores.get(dataset, 0.0) for dataset in ALL_DATASETS),
        len(ALL_DATASETS))

    # Average per modality, e.g. 'Image-Overall'.
    for modality in MODALITIES:
        modality_datasets = ALL_DATASETS_SPLITS.get(modality, [])
        avg_scores[f"{modality.capitalize()}-Overall"] = get_avg(
            sum(all_scores.get(dataset, 0.0) for dataset in modality_datasets),
            len(modality_datasets)
        )

    # Average per sub-task, e.g. 'I-CLS'.
    for modality, datasets_list in DATASETS.items():
        for sub_task, datasets in datasets_list.items():
            sub_task_sum = sum(all_scores.get(dataset, 0.0) for dataset in datasets)
            avg_scores[sub_task] = get_avg(sub_task_sum, len(datasets))

    all_scores.update(avg_scores)
    return all_scores
|
| 118 |
|
| 119 |
def generate_model_row(data):
|
| 120 |
metadata = data['metadata']
|
|
|
|
| 135 |
df = df.sort_values(by='Overall', ascending=False).reset_index(drop=True)
|
| 136 |
df['Rank'] = range(1, len(df) + 1)
|
| 137 |
df = create_hyperlinked_names(df)
|
|
|
|
| 138 |
return df
|
| 139 |
|
| 140 |
def refresh_data():
|