saattrupdan committed
Commit ada1f6c · 1 Parent(s): 4bf4abc

feat: Use experimental nested t-test to determine statistical significance

Files changed (1)
  1. app.py +50 -6
app.py CHANGED
@@ -555,18 +555,19 @@ def produce_radial_plot(
         for language in languages:
             if model_id not in results_dfs_filtered[language].index:
                 continue
+
             score_list = results_dfs_filtered[language].loc[model_id][task]
+            if all(score < 1 for score in score_list):
+                score_list = [100 * score for score in score_list]
+
             win_ratio = 100 * np.mean([
-                stats.ttest_rel(
-                    a=score_list, b=other_scores, alternative="greater"
-                ).pvalue < 0.05
+                scores_statistically_better(
+                    score_values_1=score_list, score_values_2=other_scores
+                )
                 for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
             ])
             win_ratios.append(win_ratio)
 
-            if all(score < 1 for score in score_list):
-                score_list = [100 * score for score in score_list]
-
             scores.append(np.mean(score_list))
         if use_win_ratio:
             result_list.append(np.mean(win_ratios))
@@ -687,5 +688,48 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
 
     return results_dfs
 
+
+def scores_statistically_better(
+    score_values_1: list[float], score_values_2: list[float]
+) -> bool:
+    """Determine whether the first score group is statistically better than the second.
+
+    Args:
+        score_values_1:
+            The scores for the first group.
+        score_values_2:
+            The scores for the second group.
+
+    Returns:
+        Whether the first score group is statistically better than the second.
+    """
+    assert len(score_values_1) == len(score_values_2), (
+        "The two score groups must have the same length."
+    )
+
+    # Separate the scores into groups of 10, consisting of the scores for each
+    # dataset
+    group_scores_1 = [
+        score_values_1[idx:idx + 10] for idx in range(0, len(score_values_1), 10)
+    ]
+    group_scores_2 = [
+        score_values_2[idx:idx + 10] for idx in range(0, len(score_values_2), 10)
+    ]
+
+    # Compute t-statistics for each group separately, and compute the mean
+    # t-statistic
+    t_statistics = [
+        stats.ttest_ind(a=group_1, b=group_2, alternative="greater").statistic
+        for group_1, group_2 in zip(group_scores_1, group_scores_2)
+    ]
+    mean_t_statistic = np.mean(t_statistics)
+
+    # Compute the p-value for the mean t-statistic, where the null hypothesis is
+    # that the first group does not have a larger mean score than the second group
+    degrees_of_freedom = len(score_values_1) - 1
+    p_value = 1 - stats.t.cdf(abs(mean_t_statistic), degrees_of_freedom)
+
+    return p_value < 0.05
+
 if __name__ == "__main__":
     main()
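
For reference, the sketch below (not part of the commit) exercises the same nested t-test recipe on synthetic score lists: split each model's flat score list into per-dataset groups of 10, compute a one-sided two-sample t-statistic per group, average the t-statistics, and compare the resulting p-value against 0.05. The group size of 10 and the 0.05 threshold are taken from the diff above; the nested_ttest_better name, the synthetic score distributions and the random seed are made-up assumptions for illustration, and the real scores_statistically_better helper in app.py is the authoritative version.

# nested_ttest_demo.py -- illustrative only, not part of the commit.
import numpy as np
from scipy import stats


def nested_ttest_better(scores_1: list[float], scores_2: list[float]) -> bool:
    """Compact mirror of the committed scores_statistically_better helper."""
    assert len(scores_1) == len(scores_2)

    # Split the flat score lists into per-dataset groups of 10 scores each
    groups_1 = [scores_1[i:i + 10] for i in range(0, len(scores_1), 10)]
    groups_2 = [scores_2[i:i + 10] for i in range(0, len(scores_2), 10)]

    # One-sided two-sample t-statistic per dataset, then averaged
    t_stats = [
        stats.ttest_ind(a=g1, b=g2, alternative="greater").statistic
        for g1, g2 in zip(groups_1, groups_2)
    ]

    # p-value of the mean t-statistic under a t-distribution
    p_value = 1 - stats.t.cdf(abs(np.mean(t_stats)), df=len(scores_1) - 1)
    return p_value < 0.05


if __name__ == "__main__":
    rng = np.random.default_rng(seed=4242)

    # Synthetic models evaluated on two datasets with 10 scores each
    strong = list(rng.normal(loc=75.0, scale=2.0, size=20))
    weak = list(rng.normal(loc=65.0, scale=2.0, size=20))
    similar = list(rng.normal(loc=74.5, scale=2.0, size=20))

    # A 10-point gap should come out significant (True); a 0.5-point gap
    # typically stays within noise and should come out False.
    print("strong vs weak:   ", nested_ttest_better(strong, weak))
    print("strong vs similar:", nested_ttest_better(strong, similar))

Unlike the previous stats.ttest_rel call, which paired the two models' full flattened score lists, this nested variant computes a t-statistic within each dataset and only then averages, so scores from different datasets are never compared against each other directly.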