Commit
·
c97530c
1
Parent(s):
6bdb37f
feat: Use actual ranks on scale
Browse files
app.py
CHANGED
@@ -126,7 +126,7 @@ paper](https://aclanthology.org/2023.nodalida-1.20):
|
|
126 |
UPDATE_FREQUENCY_MINUTES = 5
|
127 |
MIN_COLOUR_DISTANCE_BETWEEN_MODELS = 200
|
128 |
DEFAULT_LANGUAGES = ["Danish"]
|
129 |
-
DEFAULT_MODELS = ["gpt-4-0613", "
|
130 |
|
131 |
|
132 |
class Task(BaseModel):
|
@@ -633,6 +633,7 @@ def produce_radial_plot(
|
|
633 |
for task in tasks:
|
634 |
for language in languages:
|
635 |
df = results_dfs_filtered[language][task].dropna()
|
|
|
636 |
model_ids_sorted: list[str] = (
|
637 |
df.map(np.mean).sort_values(ascending=False).index.tolist()
|
638 |
)
|
@@ -649,14 +650,15 @@ def produce_radial_plot(
|
|
649 |
a=best_scores, b=scores, alternative="greater"
|
650 |
).pvalue < 0.05
|
651 |
if worse_than_previous_models:
|
652 |
-
|
|
|
|
|
653 |
best_scores = scores
|
654 |
ranks.append(rank)
|
655 |
|
656 |
ranks = np.asarray(ranks)
|
657 |
-
|
658 |
-
|
659 |
-
all_rank_scores[task][language][model_id] = score
|
660 |
logger.info("Successfully computed rank scores.")
|
661 |
|
662 |
# Add all the evaluation results for each model
|
@@ -671,7 +673,7 @@ def produce_radial_plot(
|
|
671 |
if model_id not in results_dfs_filtered[language].index:
|
672 |
continue
|
673 |
|
674 |
-
rank_score =
|
675 |
rank_scores.append(rank_score)
|
676 |
|
677 |
score_arr = np.array(results_dfs_filtered[language].loc[model_id][task])
|
@@ -699,7 +701,9 @@ def produce_radial_plot(
|
|
699 |
# Sort the models (and their results) such that the model who beats most other
|
700 |
# models comes first. This will result in the "smaller areas" being on top of the "larger
|
701 |
# areas", which is more aesthetically pleasing.
|
702 |
-
sorted_idxs = num_models_beaten.sum(axis=1).argsort()
|
|
|
|
|
703 |
model_ids = np.asarray(model_ids)[sorted_idxs].tolist()
|
704 |
results = result_matrix[sorted_idxs].tolist()
|
705 |
|
@@ -734,7 +738,11 @@ def produce_radial_plot(
|
|
734 |
|
735 |
# Builds the radial plot from the results
|
736 |
fig.update_layout(
|
737 |
-
polar=dict(
|
|
|
|
|
|
|
|
|
738 |
showlegend=True,
|
739 |
title=title,
|
740 |
width=plot_width,
|
|
|
126 |
UPDATE_FREQUENCY_MINUTES = 5
|
127 |
MIN_COLOUR_DISTANCE_BETWEEN_MODELS = 200
|
128 |
DEFAULT_LANGUAGES = ["Danish"]
|
129 |
+
DEFAULT_MODELS = ["gpt-4-0613", "google/gemma-3-12b-it"]
|
130 |
|
131 |
|
132 |
class Task(BaseModel):
|
|
|
633 |
for task in tasks:
|
634 |
for language in languages:
|
635 |
df = results_dfs_filtered[language][task].dropna()
|
636 |
+
stddev = df.map(np.mean).std()
|
637 |
model_ids_sorted: list[str] = (
|
638 |
df.map(np.mean).sort_values(ascending=False).index.tolist()
|
639 |
)
|
|
|
650 |
a=best_scores, b=scores, alternative="greater"
|
651 |
).pvalue < 0.05
|
652 |
if worse_than_previous_models:
|
653 |
+
difference = np.mean(best_scores) - np.mean(scores)
|
654 |
+
normalised_difference = difference / stddev
|
655 |
+
rank += normalised_difference
|
656 |
best_scores = scores
|
657 |
ranks.append(rank)
|
658 |
|
659 |
ranks = np.asarray(ranks)
|
660 |
+
for model_id, rank in zip(model_ids_sorted, ranks):
|
661 |
+
all_rank_scores[task][language][model_id] = rank
|
|
|
662 |
logger.info("Successfully computed rank scores.")
|
663 |
|
664 |
# Add all the evaluation results for each model
|
|
|
673 |
if model_id not in results_dfs_filtered[language].index:
|
674 |
continue
|
675 |
|
676 |
+
rank_score = all_rank_scores[task][language][model_id]
|
677 |
rank_scores.append(rank_score)
|
678 |
|
679 |
score_arr = np.array(results_dfs_filtered[language].loc[model_id][task])
|
|
|
701 |
# Sort the models (and their results) such that the model who beats most other
|
702 |
# models comes first. This will result in the "smaller areas" being on top of the "larger
|
703 |
# areas", which is more aesthetically pleasing.
|
704 |
+
sorted_idxs = num_models_beaten.sum(axis=1).argsort()
|
705 |
+
if not use_rank_score:
|
706 |
+
sorted_idxs = sorted_idxs[::-1]
|
707 |
model_ids = np.asarray(model_ids)[sorted_idxs].tolist()
|
708 |
results = result_matrix[sorted_idxs].tolist()
|
709 |
|
|
|
738 |
|
739 |
# Builds the radial plot from the results
|
740 |
fig.update_layout(
|
741 |
+
polar=dict(
|
742 |
+
radialaxis=dict(
|
743 |
+
visible=show_scale, range=[5, 1] if use_rank_score else [0, 100]
|
744 |
+
),
|
745 |
+
),
|
746 |
showlegend=True,
|
747 |
title=title,
|
748 |
width=plot_width,
|