import gradio as gr
import evaluate
l3score = evaluate.load("nhop/L3Score")
def compute_l3score(api_key, provider, model, questions, predictions, references):
    try:
        # Split each textbox into a list of non-empty lines before scoring.
        result = l3score.compute(
            questions=[q.strip() for q in questions.split("\n") if q.strip()],
            predictions=[p.strip() for p in predictions.split("\n") if p.strip()],
            references=[r.strip() for r in references.split("\n") if r.strip()],
            api_key=api_key,
            provider=provider,
            model=model,
        )
        return result
    except Exception as e:
        return {"error": str(e)}
with gr.Blocks() as demo:
    gr.Markdown(r"""
# Metric: L3Score
""")
    with gr.Row():
        api_key = gr.Textbox(label="API Key", type="password")
        provider = gr.Dropdown(label="Provider", choices=["openai", "deepseek", "xai"], value="openai")
        model = gr.Textbox(label="Model", value="gpt-4o-mini")
    with gr.Row():
        questions = gr.Textbox(label="Questions (one per line)", lines=4, placeholder="What is the capital of France?")
        predictions = gr.Textbox(label="Predictions (one per line)", lines=4, placeholder="Paris")
        references = gr.Textbox(label="References (one per line)", lines=4, placeholder="Paris")
    compute_button = gr.Button("Compute L3Score")
    output = gr.JSON(label="L3Score Result")
    compute_button.click(
        fn=compute_l3score,
        inputs=[api_key, provider, model, questions, predictions, references],
        outputs=output,
    )
gr.Markdown(r"""
## Description
**L3Score** evaluates how semantically close a model-generated answer is to a reference answer for a given question. It uses a **language model as a judge**, prompted with:
```text
You are given a question, ground-truth answer, and a candidate answer.
Question: {{question}}
Ground-truth answer: {{gt}}
Candidate answer: {{answer}}
Is the semantic meaning of the ground-truth and candidate answers similar?
Answer in one word - Yes or No.
```
The model's **log-probabilities** for "Yes" and "No" tokens are used to compute the score.
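
For illustration, here is a minimal sketch of how such top-token log-probabilities could be requested with the `openai` Python client for a single question (the prompt text and parameter values are assumptions, not the metric's internal code):

```python
from openai import OpenAI

client = OpenAI(api_key="your-openai-api-key")
prompt = (
    "You are given a question, ground-truth answer, and a candidate answer.\n"
    "Question: What is the capital of France?\n"
    "Ground-truth answer: Paris\n"
    "Candidate answer: Paris\n"
    "Is the semantic meaning of the ground-truth and candidate answers similar? "
    "Answer in one word - Yes or No."
)
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt}],
    max_tokens=1,       # only the first generated token ("Yes" or "No") matters
    logprobs=True,
    top_logprobs=5,     # return the 5 most likely tokens with their log-probabilities
)
top5 = {lp.token: lp.logprob
        for lp in response.choices[0].logprobs.content[0].top_logprobs}
```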
---""")
gr.Markdown(""" ## ๐งฎ Scoring Logic""")
gr.Markdown(r"""Let $l_{\text{yes}}$ and $l_{\text{no}}$ be the log-probabilities of 'Yes' and 'No', respectively.""",latex_delimiters=[ {"left": "$", "right": "$", "display": False }])
gr.Markdown(r"""
- If neither token is in the top-5:
$$
\text{L3Score} = 0
$$
- If both are present:
$$
\text{L3Score} = \frac{\exp(l_{\text{yes}})}{\exp(l_{\text{yes}}) + \exp(l_{\text{no}})}
$$
- If only one is present, the missing token's probability is estimated as the minimum of:
  - the remaining probability mass outside the top-5 tokens
  - the probability of the least likely top-5 token

The score ranges from 0 to 1, where 1 indicates the highest confidence by the LLM that the predicted and reference answers are semantically equivalent.
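
As a minimal illustration of this arithmetic (a sketch under the assumptions above, not the metric's internal implementation):

```python
import math

def l3score_from_top5(top5):
    # top5: dict mapping the judge's top-5 first tokens to their log-probabilities.
    l_yes, l_no = top5.get("Yes"), top5.get("No")
    if l_yes is None and l_no is None:
        return 0.0                                    # neither token in the top-5
    if l_yes is not None and l_no is not None:
        p_yes, p_no = math.exp(l_yes), math.exp(l_no)
        return p_yes / (p_yes + p_no)                 # normalise over the two tokens
    # Only one of the two is present: bound the missing token's probability by the
    # minimum of the leftover mass outside the top-5 and the least likely top-5 token.
    probs = [math.exp(l) for l in top5.values()]
    p_other = min(1.0 - sum(probs), min(probs))
    p_present = math.exp(l_yes if l_yes is not None else l_no)
    p_yes, p_no = (p_present, p_other) if l_yes is not None else (p_other, p_present)
    return p_yes / (p_yes + p_no)

# l3score_from_top5({"Yes": -0.01, "No": -4.7, "Maybe": -6.2, "yes": -7.1, "Sure": -7.5}) -> ~0.99
```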
See [SPIQA paper](https://arxiv.org/pdf/2407.09413) for details.
---
## How to Use
```python
import evaluate
l3score = evaluate.load("nhop/L3Score")
questions = ["What is the capital of France?", "What is the capital of Germany?"]
predictions = ["Paris", "Moscow"]
references = ["Paris", "Berlin"]
score = l3score.compute(
    questions=questions,
    predictions=predictions,
    references=references,
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini",
)
print(score)
# {'L3Score': 0.49..., 'Cost': ...}
```
---
## Inputs
| Name | Type | Description |
|--------------|--------------|-----------------------------------------------------------------------------|
| `questions` | `list[str]` | The list of input questions. |
| `predictions`| `list[str]` | Generated answers by the model being evaluated. |
| `references` | `list[str]` | Ground-truth or reference answers. |
| `api_key` | `str` | API key for the selected LLM provider. |
| `provider`   | `str`        | LLM provider used as the judge; must expose top-n token log-probabilities. **Default**: `openai` |
| `model`      | `str`        | Name of the judge LLM. **Default**: `gpt-4o-mini`                             |
## Output
Calling the `compute` method returns a dictionary with the score and the API cost:
```python
{"L3Score": float, "Cost": float}
```
`L3Score` is the **average score** over all (question, prediction, reference) triplets; `Cost` is the total cost of all API calls.
---
## Examples
```python
l3score = evaluate.load("nhop/L3Score")
score = l3score.compute(
    questions=["What is the capital of France?"],
    predictions=["Paris"],
    references=["Paris"],
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini",
)
# {'L3Score': 0.99..., 'Cost': ...}
score = l3score.compute(
    questions=["What is the capital of Germany?"],
    predictions=["Moscow"],
    references=["Berlin"],
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini",
)
# {'L3Score': 0.00..., 'Cost': ...}
```
---
## ⚠️ Limitations and Bias
- Requires models that expose **top-n token log-probabilities** (e.g., OpenAI, DeepSeek, Groq).
- Scores are **only comparable when using the same judge model**.
## Citation
```bibtex
@article{pramanick2024spiqa,
title={SPIQA: A Dataset for Multimodal Question Answering on Scientific Papers},
author={Pramanick, Shraman and Chellappa, Rama and Venugopalan, Subhashini},
journal={arXiv preprint arXiv:2407.09413},
year={2024}
}
```
""")
demo.launch()