Spaces:

nhop
/

L3Score

Running

App Files Files Community

Niklas Hoepner committed on Apr 16

Commit

3adfe4c

1 Parent(s): 8f20777

Improved Gradio Application

Browse files

Files changed (2) hide show

README.md +1 -1
app.py +29 -21

README.md CHANGED Viewed

@@ -19,7 +19,7 @@ app_file: app.py
 pinned: false
 ---
-# 🦢 Metric Card: L3Score
 ## 📌 Description

 pinned: false
 ---
+# Metric Card: L3Score
 ## 📌 Description

app.py CHANGED Viewed

@@ -18,6 +18,27 @@ def compute_l3score(api_key, provider, model, questions, predictions, references
         return {"error": str(e)}
 with gr.Blocks() as demo:
     gr.Markdown(r"""
     # 🦢 L3Score Evaluation Demo
@@ -55,7 +76,9 @@ with gr.Blocks() as demo:
     \text{L3Score} = \frac{\exp(l_{\text{yes}})}{\exp(l_{\text{yes}}) + \exp(l_{\text{no}})}
     $$
-    - If only one is present, the missing token’s probability is estimated using remaining mass or the least likely top-5 token.
     ---
@@ -87,10 +110,13 @@ with gr.Blocks() as demo:
     | `predictions`| `list[str]`  | Generated answers by the model being evaluated.                            |
     | `references` | `list[str]`  | Ground-truth or reference answers.                                         |
     | `api_key`    | `str`        | API key for the selected LLM provider.                                     |
-    | `provider`   | `str`        | Must support top-n token log-probabilities.                                |
-    | `model`      | `str`        | Name of the evaluation LLM.                                                |
     ## 📄 Output
     ```python
     {"L3Score": float}
     ```
@@ -113,23 +139,5 @@ with gr.Blocks() as demo:
     ```
     """)
-    with gr.Row():
-        api_key = gr.Textbox(label="API Key", type="password")
-        provider = gr.Dropdown(label="Provider", choices=["openai", "deepseek", "xai"], value="openai")
-        model = gr.Textbox(label="Model", value="gpt-4o-mini")
-    with gr.Row():
-        questions = gr.Textbox(label="Questions (one per line)", lines=4, placeholder="What is the capital of France?")
-        predictions = gr.Textbox(label="Predictions (one per line)", lines=4, placeholder="Paris")
-        references = gr.Textbox(label="References (one per line)", lines=4, placeholder="Paris")
-    compute_button = gr.Button("Compute L3Score")
-    output = gr.JSON(label="L3Score Result")
-    compute_button.click(
-        fn=compute_l3score,
-        inputs=[api_key, provider, model, questions, predictions, references],
-        outputs=output
-    )
 demo.launch()

         return {"error": str(e)}
 with gr.Blocks() as demo:
+    with gr.Row():
+        api_key = gr.Textbox(label="API Key", type="password")
+        provider = gr.Dropdown(label="Provider", choices=["openai", "deepseek", "xai"], value="openai")
+        model = gr.Textbox(label="Model", value="gpt-4o-mini")
+    with gr.Row():
+        questions = gr.Textbox(label="Questions (one per line)", lines=4, placeholder="What is the capital of France?")
+        predictions = gr.Textbox(label="Predictions (one per line)", lines=4, placeholder="Paris")
+        references = gr.Textbox(label="References (one per line)", lines=4, placeholder="Paris")
+    compute_button = gr.Button("Compute L3Score")
+    output = gr.JSON(label="L3Score Result")
+    compute_button.click(
+        fn=compute_l3score,
+        inputs=[api_key, provider, model, questions, predictions, references],
+        outputs=output
+    )
     gr.Markdown(r"""
     # 🦢 L3Score Evaluation Demo
     \text{L3Score} = \frac{\exp(l_{\text{yes}})}{\exp(l_{\text{yes}}) + \exp(l_{\text{no}})}
     $$
+    - If only one is present, the missing token’s probability is estimated as the minimum of:
+        - the remaining probability mass not covered by the top-5 tokens
+        - the probability of the least likely top-5 token
     ---
     | `predictions`| `list[str]`  | Generated answers by the model being evaluated.                            |
     | `references` | `list[str]`  | Ground-truth or reference answers.                                         |
     | `api_key`    | `str`        | API key for the selected LLM provider.                                     |
+    | `provider`   | `str`        | Must support top-n token log-probabilities. **Default**: `openai`          |
+    | `model`      | `str`        | Name of the evaluation LLM. **Default**: `gpt-4o-mini`                     |
     ## 📄 Output
+    Calling the `compute` method returns a dictionary containing the L3Score:
     ```python
     {"L3Score": float}
     ```
     ```
     """)
 demo.launch()