Commit
·
ada1f6c
1
Parent(s):
4bf4abc
feat: Use experimental nested t-test to determine statistical significance
Browse files
app.py
CHANGED
@@ -555,18 +555,19 @@ def produce_radial_plot(
|
|
555 |
for language in languages:
|
556 |
if model_id not in results_dfs_filtered[language].index:
|
557 |
continue
|
|
|
558 |
score_list = results_dfs_filtered[language].loc[model_id][task]
|
|
|
|
|
|
|
559 |
win_ratio = 100 * np.mean([
|
560 |
-
|
561 |
-
|
562 |
-
)
|
563 |
for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
|
564 |
])
|
565 |
win_ratios.append(win_ratio)
|
566 |
|
567 |
-
if all(score < 1 for score in score_list):
|
568 |
-
score_list = [100 * score for score in score_list]
|
569 |
-
|
570 |
scores.append(np.mean(score_list))
|
571 |
if use_win_ratio:
|
572 |
result_list.append(np.mean(win_ratios))
|
@@ -687,5 +688,48 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
|
|
687 |
|
688 |
return results_dfs
|
689 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
690 |
if __name__ == "__main__":
|
691 |
main()
|
|
|
555 |
for language in languages:
|
556 |
if model_id not in results_dfs_filtered[language].index:
|
557 |
continue
|
558 |
+
|
559 |
score_list = results_dfs_filtered[language].loc[model_id][task]
|
560 |
+
if all(score < 1 for score in score_list):
|
561 |
+
score_list = [100 * score for score in score_list]
|
562 |
+
|
563 |
win_ratio = 100 * np.mean([
|
564 |
+
scores_statistically_better(
|
565 |
+
score_values_1=score_list, score_values_2=other_scores
|
566 |
+
)
|
567 |
for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
|
568 |
])
|
569 |
win_ratios.append(win_ratio)
|
570 |
|
|
|
|
|
|
|
571 |
scores.append(np.mean(score_list))
|
572 |
if use_win_ratio:
|
573 |
result_list.append(np.mean(win_ratios))
|
|
|
688 |
|
689 |
return results_dfs
|
690 |
|
691 |
+
|
692 |
+
def scores_statistically_better(
    score_values_1: list[float],
    score_values_2: list[float],
    *,
    group_size: int = 10,
    significance_level: float = 0.05,
) -> bool:
    """Determine whether the first score group is statistically better than the second.

    The scores are split into consecutive chunks of `group_size` values (one chunk
    per dataset). A one-sided independent two-sample t-test is computed for each
    pair of chunks, and the mean of the resulting t-statistics is compared against
    a t-distribution with `len(score_values_1) - 1` degrees of freedom.

    Args:
        score_values_1:
            The scores for the first group.
        score_values_2:
            The scores for the second group.
        group_size:
            The number of consecutive scores belonging to each dataset. Defaults
            to 10.
        significance_level:
            The p-value threshold below which the first group is considered
            statistically better. Defaults to 0.05.

    Returns:
        Whether the first score group is statistically better than the second.
        Empty score groups are never considered statistically better.

    Raises:
        ValueError:
            If the two score groups do not have the same length.
    """
    num_scores = len(score_values_1)
    if num_scores != len(score_values_2):
        raise ValueError("The two score groups must have the same length.")

    # Nothing to compare; avoid taking the mean of an empty list and a negative
    # number of degrees of freedom
    if num_scores == 0:
        return False

    # Separate the scores into groups of `group_size`, consisting of the scores
    # for each dataset
    group_scores_1 = [
        score_values_1[idx:idx + group_size]
        for idx in range(0, num_scores, group_size)
    ]
    group_scores_2 = [
        score_values_2[idx:idx + group_size]
        for idx in range(0, num_scores, group_size)
    ]

    # Compute t-statistics for each group separately, and compute the mean
    # t-statistic
    t_statistics = [
        stats.ttest_ind(a=group_1, b=group_2, alternative="greater").statistic
        for group_1, group_2 in zip(group_scores_1, group_scores_2)
    ]
    mean_t_statistic = np.mean(t_statistics)

    # Compute the p-value for the mean t-statistic, where the null hypothesis is
    # that the first group does not have a larger mean score than the second
    # group. The sign of the statistic is kept on purpose: taking its absolute
    # value would also report the first group as "better" when it is
    # significantly worse. The survival function is `1 - cdf`.
    degrees_of_freedom = num_scores - 1
    p_value = stats.t.sf(mean_t_statistic, degrees_of_freedom)

    # A NaN p-value (which arises if any chunk yields a NaN t-statistic) compares
    # as not significant
    return bool(p_value < significance_level)
|
733 |
+
|
734 |
if __name__ == "__main__":
|
735 |
main()
|