Joschka Strueber committed
Commit bd1b20b · Parent: 0d09d9a

[Ref] back to markdown

Files changed (1):
  1. app.py +8 -19
app.py CHANGED
@@ -78,27 +78,16 @@ with gr.Blocks(title="LLM Similarity Analyzer", css=app_util.custom_css) as demo
     )
 
     gr.Markdown("## Information")
-    metric_info_html = r"""
-    <!-- Include KaTeX CSS for styling -->
-    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-vZTGXXFDvM1R7zDKx2g5N5S4FcoFdTJuFTz1Xj2A2/J1j4fGmS7a6hLQ6ZPfF1sk" crossorigin="anonymous">
-    <!-- Include KaTeX and its auto-render extension -->
-    <script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-6R6ckgSpF6yXUHg9+KJGXN9I+ik5U9dviDuzhSxrtk4AUaGr8/8Qovm6N9fl/hkz" crossorigin="anonymous"></script>
-    <script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-mll67QQ8ErU7t8/QqU3m0Cq56E7i2xUeFYSv6O9V3CRjNdqPzqxK9z6gS9GQFj8D" crossorigin="anonymous"
-        onload="renderMathInElement(document.body);"></script>
+    metric_info_markdown = r"""
+    We propose Chance Adjusted Probabilistic Agreement (\(\operatorname{CAPA}\), or \(\kappa_p\)), a novel metric for model similarity which adjusts for chance agreement due to accuracy.
 
-    <div>
-        <p>
-            We propose Chance Adjusted Probabilistic Agreement ($\operatorname{CAPA}$, or $\kappa_p$), a novel metric
-            for model similarity which adjusts for chance agreement due to accuracy. Using CAPA, we find:
-        </p>
-        <ol>
-            <li>LLM-as-a-judge scores are biased towards more similar models controlling for the model's capability.</li>
-            <li>Gain from training strong models on annotations of weak supervisors (weak-to-strong generalization) is higher when the two models are more different.</li>
-            <li>Concerningly, model errors are getting more correlated as capabilities increase.</li>
-        </ol>
-    </div>
+    Using CAPA, we find:
+
+    1. LLM-as-a-judge scores are biased towards more similar models controlling for the model's capability.
+    2. Gain from training strong models on annotations of weak supervisors (weak-to-strong generalization) is higher when the two models are more different.
+    3. Concerningly, model errors are getting more correlated as capabilities increase.
     """
-    gr.HTML(value=metric_info_html)
+    gr.Markdown(metric_info_markdown)
     with gr.Row():
         gr.Image(value="data/table_capa.png", label="Comparison of different similarity metrics for multiple-choice questions", elem_classes="image_container", interactive=False)
         gr.Markdown("""