import gradio as gr
import evaluate

l3score = evaluate.load("nhop/L3Score")


def compute_l3score(api_key, provider, model, questions, predictions, references):
    """Split the multiline textbox inputs into lists and run the L3Score metric."""
    try:
        result = l3score.compute(
            questions=[q.strip() for q in questions.split("\n") if q.strip()],
            predictions=[p.strip() for p in predictions.split("\n") if p.strip()],
            references=[r.strip() for r in references.split("\n") if r.strip()],
            api_key=api_key,
            provider=provider,
            model=model,
        )
        return result
    except Exception as e:
        return {"error": str(e)}


with gr.Blocks() as demo:
    gr.Markdown(r"""
# Metric: L3Score
""")
    with gr.Row():
        api_key = gr.Textbox(label="API Key", type="password")
        provider = gr.Dropdown(label="Provider", choices=["openai", "deepseek", "xai"], value="openai")
        model = gr.Textbox(label="Model", value="gpt-4o-mini")
    with gr.Row():
        questions = gr.Textbox(label="Questions (one per line)", lines=4, placeholder="What is the capital of France?")
        predictions = gr.Textbox(label="Predictions (one per line)", lines=4, placeholder="Paris")
        references = gr.Textbox(label="References (one per line)", lines=4, placeholder="Paris")
    compute_button = gr.Button("Compute L3Score")
    output = gr.JSON(label="L3Score Result")
    compute_button.click(
        fn=compute_l3score,
        inputs=[api_key, provider, model, questions, predictions, references],
        outputs=output,
    )
    gr.Markdown(r"""
## 📌 Description

**L3Score** evaluates how semantically close a model-generated answer is to a reference answer for a given question. It prompts a **language model as a judge** using:

```text
You are given a question, ground-truth answer, and a candidate answer.
Question: {{question}}
Ground-truth answer: {{gt}}
Candidate answer: {{answer}}
Is the semantic meaning of the ground-truth and candidate answers similar?
Answer in one word - Yes or No.
```

The judge's **log-probabilities** for the "Yes" and "No" tokens are then used to compute the score.

---
""")
    gr.Markdown("""
## 🧮 Scoring Logic
""")
    gr.Markdown(
        r"""Let $l_{\text{yes}}$ and $l_{\text{no}}$ be the log-probabilities of 'Yes' and 'No', respectively.""",
        latex_delimiters=[{"left": "$", "right": "$", "display": False}],
    )
    gr.Markdown(r"""
- If neither token is in the top-5:

$$
\text{L3Score} = 0
$$

- If both are present:

$$
\text{L3Score} = \frac{\exp(l_{\text{yes}})}{\exp(l_{\text{yes}}) + \exp(l_{\text{no}})}
$$

- If only one is present, the missing token's probability is estimated as the minimum of:
  - the probability mass remaining outside the top-5 tokens
  - the probability of the least likely top-5 token

The score ranges from 0 to 1, where 1 indicates the highest confidence by the LLM that the predicted and reference answers are semantically equivalent.

See the [SPIQA paper](https://arxiv.org/pdf/2407.09413) for details.
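
The rule above maps directly to code. The following is a minimal sketch, assuming the judge's top-5 (token, log-probability) pairs for its first generated token are already available as a dict; the helper name `l3score_from_top_logprobs` is illustrative and not part of the metric's API.

```python
import math

def l3score_from_top_logprobs(top_logprobs):
    # top_logprobs: dict mapping each top-5 token to its log-probability.
    l_yes = top_logprobs.get("Yes")
    l_no = top_logprobs.get("No")

    # Neither token in the top-5: the score is 0.
    if l_yes is None and l_no is None:
        return 0.0

    # Both present: normalized probability of "Yes".
    if l_yes is not None and l_no is not None:
        return math.exp(l_yes) / (math.exp(l_yes) + math.exp(l_no))

    # Only one present: estimate the missing token's probability as the minimum
    # of the probability mass outside the top-5 and the least likely top-5 token.
    top_probs = [math.exp(lp) for lp in top_logprobs.values()]
    missing = min(max(0.0, 1.0 - sum(top_probs)), min(top_probs))
    p_yes = math.exp(l_yes) if l_yes is not None else missing
    p_no = math.exp(l_no) if l_no is not None else missing
    return p_yes / (p_yes + p_no)
```

For example, top-5 log-probs containing `Yes: -0.02` and `No: -4.1` give a score of roughly 0.98.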

---

## 🚀 How to Use

```python
import evaluate

l3score = evaluate.load("nhop/L3Score")

questions = ["What is the capital of France?", "What is the capital of Germany?"]
predictions = ["Paris", "Moscow"]
references = ["Paris", "Berlin"]

score = l3score.compute(
    questions=questions,
    predictions=predictions,
    references=references,
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini"
)
print(score)
# {'L3Score': 0.49..., 'Cost': ...}
```

---

## 🔠 Inputs

| Name          | Type        | Description                                                                  |
|---------------|-------------|------------------------------------------------------------------------------|
| `questions`   | `list[str]` | The list of input questions.                                                 |
| `predictions` | `list[str]` | Answers generated by the model being evaluated.                              |
| `references`  | `list[str]` | Ground-truth or reference answers.                                           |
| `api_key`     | `str`       | API key for the selected LLM provider.                                       |
| `provider`    | `str`       | LLM provider used as the judge; must support top-n token log-probabilities. **Default**: `openai` |
| `model`       | `str`       | Name of the judge LLM. **Default**: `gpt-4o-mini`                            |

## 📄 Output

Calling the `compute` method returns a dictionary containing the L3Score and the total cost of the API calls:

```python
{"L3Score": float, "Cost": float}
```

`L3Score` is the **average score** over all (question, prediction, reference) triplets; `Cost` is the total cost of all API calls.

---

## 📊 Examples

```python
l3score = evaluate.load("nhop/L3Score")

score = l3score.compute(
    questions=["What is the capital of France?"],
    predictions=["Paris"],
    references=["Paris"],
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini"
)
# {'L3Score': 0.99..., 'Cost': ...}

score = l3score.compute(
    questions=["What is the capital of Germany?"],
    predictions=["Moscow"],
    references=["Berlin"],
    api_key="your-openai-api-key",
    provider="openai",
    model="gpt-4o-mini"
)
# {'L3Score': 0.00..., 'Cost': ...}
```

---

## ⚠️ Limitations and Bias

- Requires models that expose **top-n token log-probabilities** (e.g., OpenAI, DeepSeek, Groq).
- Scores are **only comparable when using the same judge model**.

## 📖 Citation

```bibtex
@article{pramanick2024spiqa,
  title={SPIQA: A Dataset for Multimodal Question Answering on Scientific Papers},
  author={Pramanick, Shraman and Chellappa, Rama and Venugopalan, Subhashini},
  journal={arXiv preprint arXiv:2407.09413},
  year={2024}
}
```
""")

demo.launch()