import gradio as gr
import evaluate


l3score = evaluate.load("nhop/L3Score")


def compute_l3score(api_key, provider, model, questions, predictions, references):
    # Parse the newline-separated textbox inputs and compute the L3Score.
    try:
        result = l3score.compute(
            questions=[q.strip() for q in questions.split("\n") if q.strip()],
            predictions=[p.strip() for p in predictions.split("\n") if p.strip()],
            references=[r.strip() for r in references.split("\n") if r.strip()],
            api_key=api_key,
            provider=provider,
            model=model
        )
        return result
    except Exception as e:
        return {"error": str(e)}


with gr.Blocks() as demo:
    gr.Markdown(r"""
# Metric: L3Score
""")

    with gr.Row():
        api_key = gr.Textbox(label="API Key", type="password")
        provider = gr.Dropdown(label="Provider", choices=["openai", "deepseek", "xai"], value="openai")
        model = gr.Textbox(label="Model", value="gpt-4o-mini")

    with gr.Row():
        questions = gr.Textbox(label="Questions (one per line)", lines=4, placeholder="What is the capital of France?")
        predictions = gr.Textbox(label="Predictions (one per line)", lines=4, placeholder="Paris")
        references = gr.Textbox(label="References (one per line)", lines=4, placeholder="Paris")

    compute_button = gr.Button("Compute L3Score")
    output = gr.JSON(label="L3Score Result")

    compute_button.click(
        fn=compute_l3score,
        inputs=[api_key, provider, model, questions, predictions, references],
        outputs=output
    )

    gr.Markdown(r"""
## Description

**L3Score** evaluates how semantically close a model-generated answer is to a reference answer for a given question. It prompts a **language model as a judge** with the following template:

```text
You are given a question, ground-truth answer, and a candidate answer.

Question: {{question}}
Ground-truth answer: {{gt}}
Candidate answer: {{answer}}

Is the semantic meaning of the ground-truth and candidate answers similar?
Answer in one word - Yes or No.
```

The judge model's **log-probabilities** for the "Yes" and "No" tokens are then used to compute the score.
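For illustration only (this is not the metric's internal implementation), such a judge call can be issued with the OpenAI Python client by requesting the top-5 log-probabilities of a single generated token; the prompt assembly and token matching below are assumptions of this sketch:

```python
# Illustrative sketch: query a judge LLM for the top-5 next-token
# log-probabilities that L3Score is based on. Key, model, and prompt
# handling here are assumptions, not the metric's own code.
from openai import OpenAI

PROMPT = (
    "You are given a question, ground-truth answer, and a candidate answer.\n\n"
    "Question: {question}\n"
    "Ground-truth answer: {gt}\n"
    "Candidate answer: {answer}\n\n"
    "Is the semantic meaning of the ground-truth and candidate answers similar? "
    "Answer in one word - Yes or No."
)

client = OpenAI(api_key="your-openai-api-key")
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{
        "role": "user",
        "content": PROMPT.format(
            question="What is the capital of France?", gt="Paris", answer="Paris"
        ),
    }],
    max_tokens=1,
    logprobs=True,
    top_logprobs=5,
)

# Log-probabilities of the top-5 candidates for the single generated token.
top = response.choices[0].logprobs.content[0].top_logprobs
candidates = {t.token.strip().lower(): t.logprob for t in top}
l_yes, l_no = candidates.get("yes"), candidates.get("no")  # None if absent
```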

---""")

    gr.Markdown("""## 🧮 Scoring Logic""")
    gr.Markdown(
        r"""Let $l_{\text{yes}}$ and $l_{\text{no}}$ be the log-probabilities of 'Yes' and 'No', respectively.""",
        latex_delimiters=[{"left": "$", "right": "$", "display": False}],
    )
    gr.Markdown(r"""
- If neither 'Yes' nor 'No' appears among the top-5 tokens:

$$
\text{L3Score} = 0
$$

- If both are present:

$$
\text{L3Score} = \frac{\exp(l_{\text{yes}})}{\exp(l_{\text{yes}}) + \exp(l_{\text{no}})}
$$

- If only one is present, the missing token's probability is estimated as the minimum of:
  - the probability mass remaining outside the top-5 tokens
  - the probability of the least likely top-5 token

The score ranges from 0 to 1, where 1 indicates the highest confidence of the judge LLM that the predicted and reference answers are semantically equivalent; the sketch below spells out this rule in code.
See the [SPIQA paper](https://arxiv.org/pdf/2407.09413) for details.
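The following sketch (an illustration, not the metric's source) turns a dictionary of top-5 token log-probabilities into a score, using the missing-token estimate described above:

```python
import math

def l3score_from_top5(top5_logprobs):
    # top5_logprobs: {token: logprob} for the judge's top-5 next tokens.
    probs = {tok.strip().lower(): math.exp(lp) for tok, lp in top5_logprobs.items()}
    p_yes, p_no = probs.get("yes"), probs.get("no")

    if p_yes is None and p_no is None:
        return 0.0  # neither 'Yes' nor 'No' in the top-5

    if p_yes is None or p_no is None:
        # Estimate the missing token's probability as the minimum of the
        # remaining probability mass and the least likely top-5 token.
        remaining = max(0.0, 1.0 - sum(probs.values()))
        estimate = min(remaining, min(probs.values()))
        p_yes = estimate if p_yes is None else p_yes
        p_no = estimate if p_no is None else p_no

    return p_yes / (p_yes + p_no)

print(l3score_from_top5({"Yes": -0.02, "No": -4.0, "the": -6.0}))  # ~0.98
```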

---

## How to Use

```python
import evaluate

l3score = evaluate.load("nhop/L3Score")

questions = ["What is the capital of France?", "What is the capital of Germany?"]
predictions = ["Paris", "Moscow"]
references = ["Paris", "Berlin"]

score = l3score.compute(
    questions=questions,
    predictions=predictions,
    references=references,
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini"
)

print(score)
# {'L3Score': 0.49..., 'Cost': ...}
```

---

## Inputs

| Name          | Type        | Description                                                                            |
|---------------|-------------|----------------------------------------------------------------------------------------|
| `questions`   | `list[str]` | The list of input questions.                                                           |
| `predictions` | `list[str]` | Answers generated by the model being evaluated.                                        |
| `references`  | `list[str]` | Ground-truth or reference answers.                                                     |
| `api_key`     | `str`       | API key for the selected LLM provider.                                                 |
| `provider`    | `str`       | Judge LLM provider; must support top-n token log-probabilities. **Default**: `openai`  |
| `model`       | `str`       | Name of the judge LLM. **Default**: `gpt-4o-mini`                                      |

## Output

Calling the `compute` method returns a dictionary with the score and the API cost:

```python
{"L3Score": float, "Cost": float}
```

`L3Score` is the **average score** over all (question, prediction, reference) triplets, and `Cost` is the total cost of the underlying API calls.
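Since `compute` returns only this average, per-triplet scores can be obtained by scoring one example at a time. A minimal sketch, relying only on the `compute` signature documented above:

```python
import evaluate

l3score = evaluate.load("nhop/L3Score")

questions = ["What is the capital of France?", "What is the capital of Germany?"]
predictions = ["Paris", "Moscow"]
references = ["Paris", "Berlin"]

per_example = []
total_cost = 0.0
for q, p, r in zip(questions, predictions, references):
    result = l3score.compute(
        questions=[q], predictions=[p], references=[r],
        api_key="your-openai-api-key", provider="openai", model="gpt-4o-mini",
    )
    per_example.append(result["L3Score"])
    total_cost += result["Cost"]

print(per_example)  # per-triplet scores
print(total_cost)   # summed cost of the individual calls
```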

---

## Examples

```python
l3score = evaluate.load("nhop/L3Score")

score = l3score.compute(
    questions=["What is the capital of France?"],
    predictions=["Paris"],
    references=["Paris"],
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini"
)
# {'L3Score': 0.99..., 'Cost': ...}

score = l3score.compute(
    questions=["What is the capital of Germany?"],
    predictions=["Moscow"],
    references=["Berlin"],
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini"
)
# {'L3Score': 0.00..., 'Cost': ...}
```

---

## ⚠️ Limitations and Bias

- Requires models that expose **top-n token log-probabilities** (e.g., OpenAI, DeepSeek, Groq); a quick capability check is sketched below.
- Scores are **only comparable when using the same judge model**.
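The capability check can be run directly against an OpenAI-compatible endpoint; the base URL, key, and model name below are placeholders, not recommendations:

```python
# Illustrative check (not part of the metric): confirm that an endpoint
# returns top-n token log-probabilities. Base URL, key, and model are
# placeholders for whichever OpenAI-compatible provider you want to test.
from openai import OpenAI

client = OpenAI(api_key="your-api-key", base_url="https://api.your-provider.com/v1")
resp = client.chat.completions.create(
    model="your-model-name",
    messages=[{"role": "user", "content": "Answer in one word - Yes or No: is 2 + 2 = 4?"}],
    max_tokens=1,
    logprobs=True,
    top_logprobs=5,
)
top = resp.choices[0].logprobs.content[0].top_logprobs
print([(t.token, t.logprob) for t in top])  # should list five candidate tokens
```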

## Citation

```bibtex
@article{pramanick2024spiqa,
  title={SPIQA: A Dataset for Multimodal Question Answering on Scientific Papers},
  author={Pramanick, Shraman and Chellappa, Rama and Venugopalan, Subhashini},
  journal={arXiv preprint arXiv:2407.09413},
  year={2024}
}
```
""")


demo.launch()