saattrupdan committed on
Commit
c97530c
·
1 Parent(s): 6bdb37f

feat: Use actual ranks on scale

Browse files
Files changed (1) hide show
  1. app.py +16 -8
app.py CHANGED
@@ -126,7 +126,7 @@ paper](https://aclanthology.org/2023.nodalida-1.20):
126
  UPDATE_FREQUENCY_MINUTES = 5
127
  MIN_COLOUR_DISTANCE_BETWEEN_MODELS = 200
128
  DEFAULT_LANGUAGES = ["Danish"]
129
- DEFAULT_MODELS = ["gpt-4-0613", "mistralai/Mistral-7B-v0.1"]
130
 
131
 
132
  class Task(BaseModel):
@@ -633,6 +633,7 @@ def produce_radial_plot(
633
  for task in tasks:
634
  for language in languages:
635
  df = results_dfs_filtered[language][task].dropna()
 
636
  model_ids_sorted: list[str] = (
637
  df.map(np.mean).sort_values(ascending=False).index.tolist()
638
  )
@@ -649,14 +650,15 @@ def produce_radial_plot(
649
  a=best_scores, b=scores, alternative="greater"
650
  ).pvalue < 0.05
651
  if worse_than_previous_models:
652
- rank += 1
 
 
653
  best_scores = scores
654
  ranks.append(rank)
655
 
656
  ranks = np.asarray(ranks)
657
- scores = 1 - (ranks / ranks.max())
658
- for model_id, score in zip(model_ids_sorted, scores):
659
- all_rank_scores[task][language][model_id] = score
660
  logger.info("Successfully computed rank scores.")
661
 
662
  # Add all the evaluation results for each model
@@ -671,7 +673,7 @@ def produce_radial_plot(
671
  if model_id not in results_dfs_filtered[language].index:
672
  continue
673
 
674
- rank_score = 100 * all_rank_scores[task][language][model_id]
675
  rank_scores.append(rank_score)
676
 
677
  score_arr = np.array(results_dfs_filtered[language].loc[model_id][task])
@@ -699,7 +701,9 @@ def produce_radial_plot(
699
  # Sort the models (and their results) such that the model who beats most other
700
  # models comes first. This will result in the "smaller areas" being on top of the "larger
701
  # areas", which is more aesthetically pleasing.
702
- sorted_idxs = num_models_beaten.sum(axis=1).argsort()[::-1]
 
 
703
  model_ids = np.asarray(model_ids)[sorted_idxs].tolist()
704
  results = result_matrix[sorted_idxs].tolist()
705
 
@@ -734,7 +738,11 @@ def produce_radial_plot(
734
 
735
  # Builds the radial plot from the results
736
  fig.update_layout(
737
- polar=dict(radialaxis=dict(visible=show_scale, range=[0, 100])),
 
 
 
 
738
  showlegend=True,
739
  title=title,
740
  width=plot_width,
 
126
  UPDATE_FREQUENCY_MINUTES = 5
127
  MIN_COLOUR_DISTANCE_BETWEEN_MODELS = 200
128
  DEFAULT_LANGUAGES = ["Danish"]
129
+ DEFAULT_MODELS = ["gpt-4-0613", "google/gemma-3-12b-it"]
130
 
131
 
132
  class Task(BaseModel):
 
633
  for task in tasks:
634
  for language in languages:
635
  df = results_dfs_filtered[language][task].dropna()
636
+ stddev = df.map(np.mean).std()
637
  model_ids_sorted: list[str] = (
638
  df.map(np.mean).sort_values(ascending=False).index.tolist()
639
  )
 
650
  a=best_scores, b=scores, alternative="greater"
651
  ).pvalue < 0.05
652
  if worse_than_previous_models:
653
+ difference = np.mean(best_scores) - np.mean(scores)
654
+ normalised_difference = difference / stddev
655
+ rank += normalised_difference
656
  best_scores = scores
657
  ranks.append(rank)
658
 
659
  ranks = np.asarray(ranks)
660
+ for model_id, rank in zip(model_ids_sorted, ranks):
661
+ all_rank_scores[task][language][model_id] = rank
 
662
  logger.info("Successfully computed rank scores.")
663
 
664
  # Add all the evaluation results for each model
 
673
  if model_id not in results_dfs_filtered[language].index:
674
  continue
675
 
676
+ rank_score = all_rank_scores[task][language][model_id]
677
  rank_scores.append(rank_score)
678
 
679
  score_arr = np.array(results_dfs_filtered[language].loc[model_id][task])
 
701
  # Sort the models (and their results) such that the model who beats most other
702
  # models comes first. This will result in the "smaller areas" being on top of the "larger
703
  # areas", which is more aesthetically pleasing.
704
+ sorted_idxs = num_models_beaten.sum(axis=1).argsort()
705
+ if not use_rank_score:
706
+ sorted_idxs = sorted_idxs[::-1]
707
  model_ids = np.asarray(model_ids)[sorted_idxs].tolist()
708
  results = result_matrix[sorted_idxs].tolist()
709
 
 
738
 
739
  # Builds the radial plot from the results
740
  fig.update_layout(
741
+ polar=dict(
742
+ radialaxis=dict(
743
+ visible=show_scale, range=[5, 1] if use_rank_score else [0, 100]
744
+ ),
745
+ ),
746
  showlegend=True,
747
  title=title,
748
  width=plot_width,