Niklas Hoepner committed
Commit 3adfe4c · 1 Parent(s): 8f20777

Improved Gradio Application

Files changed (2)
  1. README.md +1 -1
  2. app.py +29 -21
README.md CHANGED
@@ -19,7 +19,7 @@ app_file: app.py
 pinned: false
 ---
 
-# 🦢 Metric Card: L3Score
+# Metric Card: L3Score
 
 ## 📌 Description
 
app.py CHANGED
@@ -18,6 +18,27 @@ def compute_l3score(api_key, provider, model, questions, predictions, references
         return {"error": str(e)}
 
 with gr.Blocks() as demo:
+
+
+    with gr.Row():
+        api_key = gr.Textbox(label="API Key", type="password")
+        provider = gr.Dropdown(label="Provider", choices=["openai", "deepseek", "xai"], value="openai")
+        model = gr.Textbox(label="Model", value="gpt-4o-mini")
+
+    with gr.Row():
+        questions = gr.Textbox(label="Questions (one per line)", lines=4, placeholder="What is the capital of France?")
+        predictions = gr.Textbox(label="Predictions (one per line)", lines=4, placeholder="Paris")
+        references = gr.Textbox(label="References (one per line)", lines=4, placeholder="Paris")
+
+    compute_button = gr.Button("Compute L3Score")
+    output = gr.JSON(label="L3Score Result")
+
+    compute_button.click(
+        fn=compute_l3score,
+        inputs=[api_key, provider, model, questions, predictions, references],
+        outputs=output
+    )
+
     gr.Markdown(r"""
     # 🦢 L3Score Evaluation Demo
 
@@ -55,7 +76,9 @@ with gr.Blocks() as demo:
     \text{L3Score} = \frac{\exp(l_{\text{yes}})}{\exp(l_{\text{yes}}) + \exp(l_{\text{no}})}
     $$
 
-    - If only one is present, the missing token’s probability is estimated using remaining mass or the least likely top-5 token.
+    - If only one is present, the missing token’s probability is estimated using the minimum of:
+        - remaining probability mass apart from the top-5 tokens
+        - the least likely top-5 token
 
     ---
 
@@ -87,10 +110,13 @@ with gr.Blocks() as demo:
     | `predictions`| `list[str]` | Generated answers by the model being evaluated. |
    | `references` | `list[str]` | Ground-truth or reference answers. |
    | `api_key` | `str` | API key for the selected LLM provider. |
-    | `provider` | `str` | Must support top-n token log-probabilities. |
-    | `model` | `str` | Name of the evaluation LLM. |
+    | `provider` | `str` | Must support top-n token log-probabilities. **Default**: openai |
+    | `model` | `str` | Name of the evaluation LLM. **Default**: gpt-4o-mini |
 
     ## 📄 Output
+
+    Calling the `compute` method returns a dictionary containing the L3Score:
+
     ```python
     {"L3Score": float}
     ```
@@ -113,23 +139,5 @@ with gr.Blocks() as demo:
     ```
     """)
 
-    with gr.Row():
-        api_key = gr.Textbox(label="API Key", type="password")
-        provider = gr.Dropdown(label="Provider", choices=["openai", "deepseek", "xai"], value="openai")
-        model = gr.Textbox(label="Model", value="gpt-4o-mini")
-
-    with gr.Row():
-        questions = gr.Textbox(label="Questions (one per line)", lines=4, placeholder="What is the capital of France?")
-        predictions = gr.Textbox(label="Predictions (one per line)", lines=4, placeholder="Paris")
-        references = gr.Textbox(label="References (one per line)", lines=4, placeholder="Paris")
-
-    compute_button = gr.Button("Compute L3Score")
-    output = gr.JSON(label="L3Score Result")
-
-    compute_button.click(
-        fn=compute_l3score,
-        inputs=[api_key, provider, model, questions, predictions, references],
-        outputs=output
-    )
-
 
 demo.launch()
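
The scoring rule spelled out in the second hunk (a two-way softmax over the judge's `Yes`/`No` log-probabilities, with a fallback estimate when one token is missing from the top 5) can be illustrated with a minimal sketch. This is hypothetical glue, not the metric's actual implementation; it assumes the judge's top-5 first-token log-probabilities have already been fetched from the provider:

```python
import math

def l3score_from_top5(top5_logprobs: dict[str, float]) -> float:
    """Hypothetical sketch of the rule described in the card above, given
    the judge's top-5 first tokens mapped to their log-probabilities."""
    # Probability mass left over outside the top-5 tokens (clamped so the
    # log below stays defined if the top-5 mass numerically exceeds 1).
    remaining = max(1.0 - sum(math.exp(lp) for lp in top5_logprobs.values()), 1e-12)
    # Log-probability of the least likely token within the top 5.
    min_top5 = min(top5_logprobs.values())
    # Missing-token estimate: min(remaining mass, least likely top-5 token).
    fallback = math.log(min(remaining, math.exp(min_top5)))
    l_yes = top5_logprobs.get("Yes", fallback)
    l_no = top5_logprobs.get("No", fallback)
    # L3Score = exp(l_yes) / (exp(l_yes) + exp(l_no))
    return math.exp(l_yes) / (math.exp(l_yes) + math.exp(l_no))

# e.g. l3score_from_top5({"Yes": -0.02, "No": -4.10, "Maybe": -5.5}) ≈ 0.98
```

When both tokens appear in the top 5 the fallback is never used, so the score reduces to the plain two-way softmax from the formula.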
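For completeness, the handler wired to `compute_button.click` plausibly splits each multiline textbox into a list and forwards it to the metric, matching the inputs table above. The `evaluate.load` id and the exact `compute` signature below are assumptions for illustration, not taken from this commit:

```python
import evaluate  # Hugging Face's `evaluate` library

def compute_l3score(api_key, provider, model, questions, predictions, references):
    # One question/prediction/reference per textbox line, per the UI labels.
    try:
        metric = evaluate.load("placeholder/L3Score")  # placeholder space id
        return metric.compute(
            questions=questions.strip().splitlines(),
            predictions=predictions.strip().splitlines(),
            references=references.strip().splitlines(),
            api_key=api_key,
            provider=provider,
            model=model,
        )
    except Exception as e:
        return {"error": str(e)}  # surfaced in the gr.JSON output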