import gradio as gr
import evaluate
l3score = evaluate.load("nhop/L3Score")


def compute_l3score(api_key, provider, model, questions, predictions, references):
    """Split the newline-separated text inputs and score them with L3Score."""
    try:
        result = l3score.compute(
            questions=[q.strip() for q in questions.split("\n") if q.strip()],
            predictions=[p.strip() for p in predictions.split("\n") if p.strip()],
            references=[r.strip() for r in references.split("\n") if r.strip()],
            api_key=api_key,
            provider=provider,
            model=model,
        )
        return result
    except Exception as e:
        # Surface API/configuration errors in the UI instead of crashing the app.
        return {"error": str(e)}

with gr.Blocks() as demo:
    gr.Markdown(r"""
# Metric: L3Score
""")
    with gr.Row():
        api_key = gr.Textbox(label="API Key", type="password")
        provider = gr.Dropdown(label="Provider", choices=["openai", "deepseek", "xai"], value="openai")
        model = gr.Textbox(label="Model", value="gpt-4o-mini")
    with gr.Row():
        questions = gr.Textbox(label="Questions (one per line)", lines=4, placeholder="What is the capital of France?")
        predictions = gr.Textbox(label="Predictions (one per line)", lines=4, placeholder="Paris")
        references = gr.Textbox(label="References (one per line)", lines=4, placeholder="Paris")
    compute_button = gr.Button("Compute L3Score")
    output = gr.JSON(label="L3Score Result")
    compute_button.click(
        fn=compute_l3score,
        inputs=[api_key, provider, model, questions, predictions, references],
        outputs=output
    )
    gr.Markdown(r"""
## 📌 Description
**L3Score** evaluates how semantically close a model-generated answer is to a reference answer for a given question. It prompts a **language model as a judge** with the following prompt:
```text
You are given a question, ground-truth answer, and a candidate answer.
Question: {{question}}
Ground-truth answer: {{gt}}
Candidate answer: {{answer}}
Is the semantic meaning of the ground-truth and candidate answers similar?
Answer in one word - Yes or No.
```
The model's **log-probabilities** for "Yes" and "No" tokens are used to compute the score.
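
For illustration only, here is a minimal sketch of how this prompt could be filled in for a single triplet. The placeholder syntax is adapted to `str.format`, and the constant and helper names are illustrative, not part of the metric's API:

```python
JUDGE_TEMPLATE = (
    "You are given a question, ground-truth answer, and a candidate answer.\n"
    "Question: {question}\n"
    "Ground-truth answer: {gt}\n"
    "Candidate answer: {answer}\n"
    "Is the semantic meaning of the ground-truth and candidate answers similar?\n"
    "Answer in one word - Yes or No."
)

def build_judge_prompt(question, reference, prediction):
    # Fill the template for one (question, reference, prediction) triplet.
    return JUDGE_TEMPLATE.format(question=question, gt=reference, answer=prediction)

print(build_judge_prompt("What is the capital of France?", "Paris", "Paris"))
```
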
---""")
    gr.Markdown("""## 🧮 Scoring Logic""")
    gr.Markdown(
        r"""Let $l_{\text{yes}}$ and $l_{\text{no}}$ be the log-probabilities of 'Yes' and 'No', respectively.""",
        latex_delimiters=[{"left": "$", "right": "$", "display": False}],
    )
    gr.Markdown(r"""
- If neither 'Yes' nor 'No' appears among the judge's top-5 most likely tokens:
$$
\text{L3Score} = 0
$$
- If both are present:
$$
\text{L3Score} = \frac{\exp(l_{\text{yes}})}{\exp(l_{\text{yes}}) + \exp(l_{\text{no}})}
$$
- If only one of the two is present, the missing token's probability is estimated as the minimum of:
  - the probability mass remaining outside the top-5 tokens
  - the probability of the least likely top-5 token

  The score is then computed with the same formula as above (see the sketch below).

The score ranges from 0 to 1, where 1 indicates the highest confidence by the judge LLM that the predicted and reference answers are semantically equivalent.
See [SPIQA paper](https://arxiv.org/pdf/2407.09413) for details.
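
As a concrete illustration, here is a minimal sketch of this scoring rule, assuming the judge's top-5 token log-probabilities for its one-word verdict are available as a plain `{token: log-probability}` dictionary. The function name and input format are illustrative, not the metric's internal implementation:

```python
import math

def l3score_from_top5(top5_logprobs: dict[str, float]) -> float:
    # Score one (question, prediction, reference) triplet from the judge's
    # top-5 token log-probabilities for its one-word verdict.
    l_yes = top5_logprobs.get("Yes")
    l_no = top5_logprobs.get("No")

    if l_yes is None and l_no is None:
        return 0.0  # neither 'Yes' nor 'No' made it into the top-5

    if l_yes is not None and l_no is not None:
        p_yes, p_no = math.exp(l_yes), math.exp(l_no)
        return p_yes / (p_yes + p_no)

    # Only one of the two is present: estimate the missing probability as the
    # minimum of the mass left outside the top-5 and the least likely top-5 token.
    remaining_mass = max(1.0 - sum(math.exp(l) for l in top5_logprobs.values()), 0.0)
    least_likely = math.exp(min(top5_logprobs.values()))
    p_missing = min(remaining_mass, least_likely)

    p_present = math.exp(l_yes if l_yes is not None else l_no)
    p_yes = p_present if l_yes is not None else p_missing
    p_no = p_present if l_no is not None else p_missing
    return p_yes / (p_yes + p_no)

print(l3score_from_top5({"Yes": -0.02, "No": -4.1, ".": -6.0, "Maybe": -7.2, "yes": -7.5}))  # ~0.98
```
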
---
## 🚀 How to Use
```python
import evaluate
l3score = evaluate.load("nhop/L3Score")
questions = ["What is the capital of France?", "What is the capital of Germany?"]
predictions = ["Paris", "Moscow"]
references = ["Paris", "Berlin"]
score = l3score.compute(
questions=questions,
predictions=predictions,
references=references,
api_key="your-openai-api-key",
provider="openai",
model="gpt-4o-mini"
)
print(score)
# {'L3Score': 0.49..., 'Cost': ...}
```
---
## 🔠 Inputs
| Name | Type | Description |
|--------------|--------------|-----------------------------------------------------------------------------|
| `questions` | `list[str]` | The list of input questions. |
| `predictions`| `list[str]` | Generated answers by the model being evaluated. |
| `references` | `list[str]` | Ground-truth or reference answers. |
| `api_key` | `str` | API key for the selected LLM provider. |
| `provider`   | `str`        | LLM provider used as the judge; must expose top-n token log-probabilities. **Default**: `openai` |
| `model`      | `str`        | Name of the judge LLM. **Default**: `gpt-4o-mini`                             |
## 📄 Output
Calling the `compute` method returns a dictionary with the score and the accumulated API cost:
```python
{"L3Score": float, "Cost": float}
```
`L3Score` is the **average score** over all (question, prediction, reference) triplets; `Cost` is the total cost of all calls made to the judge API.
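
For instance, mirroring the usage example above, the two fields can be read directly from the returned dictionary (a sketch, not additional API surface):

```python
import evaluate

l3score = evaluate.load("nhop/L3Score")
score = l3score.compute(
    questions=["What is the capital of France?"],
    predictions=["Paris"],
    references=["Paris"],
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini",
)
print(f"Average L3Score: {score['L3Score']:.2f}")
print(f"Total API cost:  {score['Cost']}")
```
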
---
## 📊 Examples
```python
l3score = evaluate.load("nhop/L3Score")
score = l3score.compute(
questions=["What is the capital of France?"],
predictions=["Paris"],
references=["Paris"],
api_key="your-openai-api-key",
provider="openai",
model="gpt-4o-mini"
)
# {'L3Score': 0.99..., 'Cost': ...}
score = l3score.compute(
questions=["What is the capital of Germany?"],
predictions=["Moscow"],
references=["Berlin"],
api_key="your-openai-api-key",
provider="openai",
model="gpt-4o-mini"
)
# {'L3Score': 0.00..., 'Cost': ...}
```
---
## ⚠️ Limitations and Bias
- Requires models that expose **top-n token log-probabilities** (e.g., OpenAI, DeepSeek, Groq); see the sketch after this list.
- Scores are **only comparable when using the same judge model**.
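
To make the first requirement concrete, here is a minimal sketch using the OpenAI Python SDK of what exposing top-n token log-probabilities looks like; the parameter names are OpenAI-specific and other providers may differ:

```python
from openai import OpenAI

client = OpenAI(api_key="your-openai-api-key")
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Answer in one word - Yes or No. Is 'Paris' the same answer as 'Paris'?"}],
    logprobs=True,    # return log-probabilities of the sampled tokens
    top_logprobs=5,   # ...and the 5 most likely alternatives per position
    max_tokens=1,
)
# Top-5 candidates (token and log-probability) for the judge's one-word verdict.
first_token = response.choices[0].logprobs.content[0]
print({cand.token: cand.logprob for cand in first_token.top_logprobs})
```
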
## 📖 Citation
```bibtex
@article{pramanick2024spiqa,
title={SPIQA: A Dataset for Multimodal Question Answering on Scientific Papers},
author={Pramanick, Shraman and Chellappa, Rama and Venugopalan, Subhashini},
journal={arXiv preprint arXiv:2407.09413},
year={2024}
}
```
""")
demo.launch()