Implemented L3Score from SPIQA dataset paper
Browse files- L3Score.py +1 -3
- README.md +1 -1
L3Score.py
CHANGED
|
@@ -120,8 +120,6 @@ class L3Score(evaluate.Metric):
|
|
| 120 |
)
|
| 121 |
)
|
| 122 |
|
| 123 |
-
if api_key == "":
|
| 124 |
-
raise ValueError("api_key is required")
|
| 125 |
|
| 126 |
def _get_llm(self, model, api_key):
|
| 127 |
"""Get the LLM"""
|
|
@@ -134,7 +132,7 @@ class L3Score(evaluate.Metric):
|
|
| 134 |
questions,
|
| 135 |
predictions,
|
| 136 |
references,
|
| 137 |
-
api_key
|
| 138 |
provider="openai",
|
| 139 |
model="gpt-4o-mini",
|
| 140 |
):
|
|
|
|
| 120 |
)
|
| 121 |
)
|
| 122 |
|
|
|
|
|
|
|
| 123 |
|
| 124 |
def _get_llm(self, model, api_key):
|
| 125 |
"""Get the LLM"""
|
|
|
|
| 132 |
questions,
|
| 133 |
predictions,
|
| 134 |
references,
|
| 135 |
+
api_key,
|
| 136 |
provider="openai",
|
| 137 |
model="gpt-4o-mini",
|
| 138 |
):
|
README.md
CHANGED
|
@@ -13,7 +13,7 @@ description: >
|
|
| 13 |
It uses log-probabilities of "Yes"/"No" tokens from a language model acting as a judge.
|
| 14 |
Based on the SPIQA benchmark: https://arxiv.org/pdf/2407.09413
|
| 15 |
sdk: gradio
|
| 16 |
-
sdk_version:
|
| 17 |
app_file: app.py
|
| 18 |
pinned: false
|
| 19 |
---
|
|
|
|
| 13 |
It uses log-probabilities of "Yes"/"No" tokens from a language model acting as a judge.
|
| 14 |
Based on the SPIQA benchmark: https://arxiv.org/pdf/2407.09413
|
| 15 |
sdk: gradio
|
| 16 |
+
sdk_version: 4.44.1
|
| 17 |
app_file: app.py
|
| 18 |
pinned: false
|
| 19 |
---
|