Niklas Hoepner committed
Commit d9007fb · Parent(s): 2009d97

Added cost of API calls to output

Files changed:
- L3Score.py +52 -40
- README.md +6 -6
- requirements.txt +1 -0
L3Score.py
CHANGED
@@ -23,11 +23,10 @@ import os
 import evaluate
 import datasets
 import numpy as np
-
 import openai
 
 from langchain.chat_models.base import init_chat_model
+from litellm import model_cost
 
 _CITATION = """\
 @article{pramanick2024spiqa,
@@ -55,16 +54,17 @@ Args:
         reference should be a string.
 Returns:
     L3Score: mean L3Score for all (question, prediction, reference) triplets.
+    Cost: total cost of the LLM calls.
 Examples:
     Example 1: High certainty the prediction is the same as the ground-truth.
         >>> L3Score = evaluate.load("L3Score")
         >>> L3Score.compute(questions=["What is the capital of France?"], predictions=["Paris"], references=["Paris"], api_key="your-openai-api-key", provider="openai", model="gpt-4o-mini")
-        {'L3Score': 0.99...}
+        {'L3Score': 0.99..., 'Cost': ...}
 
     Example 2: High certainty the prediction is not the same as the ground-truth.
         >>> L3Score = evaluate.load("L3Score")
         >>> L3Score.compute(questions=["What is the capital of Germany?"], predictions=["Moscow"], references=["Berlin"], api_key="your-openai-api-key", provider="openai", model="gpt-4o-mini")
-        {'L3Score': 0.00...}
+        {'L3Score': 0.00..., 'Cost': ...}
 """
 
 
@@ -104,14 +104,20 @@ class L3Score(evaluate.Metric):
             codebase_urls=[
                 "https://github.com/google/spiqa/blob/main/metrics/llmlogscore/llmlogscore.py"
             ],
-            reference_urls=[
+            reference_urls=[
+                "https://arxiv.org/pdf/2407.09413",
+                "https://github.com/google/spiqa",
+                "https://huggingface.co/datasets/google/spiqa",
+            ],
         )
 
     def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        pass
 
-    def _verify_input(
+    def _verify_input(
+        self, questions, predictions, references, provider, api_key, model
+    ):
        """Verify the input parameters"""
 
        if provider not in PROVIDER_WITH_TOP_LOGPROBS:
@@ -120,37 +126,43 @@ class L3Score(evaluate.Metric):
                    PROVIDER_WITH_TOP_LOGPROBS
                )
            )
 
        # Check whether the model is available
 
        if provider == "openai":
            client = openai.OpenAI(api_key=api_key)
            model_names = set([model.id for model in client.models.list()])
            if model not in model_names:
-                raise ValueError(
+                raise ValueError(
+                    f"Model {model} not found for provider {provider}, available models: {model_names}"
+                )
+
        elif provider == "deepseek":
-            client = openai.OpenAI(api_key=api_key,base_url="https://api.deepseek.com")
+            client = openai.OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
            model_names = [model.id for model in client.models.list()]
            if model not in model_names:
-                raise ValueError(
+                raise ValueError(
+                    f"Model {model} not found for provider {provider}, available models: {model_names}"
+                )
+
        elif provider == "xai":
            client = openai.OpenAI(api_key=api_key, base_url="https://api.xai.com")
            model_names = [model.id for model in client.models.list()]
            if model not in model_names:
-                raise ValueError(
+                raise ValueError(
+                    f"Model {model} not found for provider {provider}, available models: {model_names}"
+                )
 
-        assert len(questions) == len(predictions) == len(references), "Questions, predictions and references must have the same length"
+        assert (
+            len(questions) == len(predictions) == len(references)
+        ), "Questions, predictions and references must have the same length"
 
    def _get_llm(self, model, api_key):
        """Get the LLM"""
        llm = init_chat_model(model=model, api_key=api_key)
        llm = llm.bind(logprobs=True, top_logprobs=5)
+        self._model_cost = model_cost[llm.model_name]
        return llm
 
    def _compute(
        self,
@@ -162,34 +174,40 @@ class L3Score(evaluate.Metric):
        model="gpt-4o-mini",
    ):
        """Returns the scores"""
 
        # Check whether llm can be initialized
        try:
-            self._verify_input(
+            self._verify_input(
+                questions, predictions, references, provider, api_key, model
+            )
        except ValueError as e:
            return {"error": str(e)}
        except openai.AuthenticationError as e:
            message = e.body["message"]
            return {"error": f"Authentication failed: {message}"}
        except Exception as e:
-            return {
+            return {
+                "error": f"An error occurred when verifying the provider/model match: {e}"
+            }
 
        # Initialize the LLM
        llm = self._get_llm(model, api_key)
 
        L3Score = 0
        count = 0
+        total_cost = 0
        for question, prediction, reference in zip(questions, predictions, references):
            try:
                response = llm.invoke(
                    (
                        "human",
-                        _PROMPT.format(
+                        _PROMPT.format(
+                            question=question, gt=reference, answer=prediction
+                        ),
                    )
                )
+                cost = self._get_cost(response)
+                total_cost += cost
            except openai.AuthenticationError as e:
                message = e.body["message"]
                return {"error": f"Authentication failed: {message}"}
@@ -214,6 +232,7 @@ class L3Score(evaluate.Metric):
 
        return {
            "L3Score": L3Score,
+            "Cost": total_cost,
        }
 
    def _calculate_L3Score(self, top_logprobs):
@@ -283,21 +302,14 @@ class L3Score(evaluate.Metric):
        """Remove white space and lower case for normalized comparisons."""
        return text.strip().lower()
 
+    def _get_cost(self, response):
+        """Get the cost of the response"""
+        return (
+            self._model_cost["input_cost_per_token"]
+            * response.usage_metadata["input_tokens"]
+            + self._model_cost["output_cost_per_token"]
+            * response.usage_metadata["output_tokens"]
+        )
 
-if __name__ == "__main__":
-
-    questions = ["What is the capital of France?", "What is the capital of Germany?"]
-    predictions = ["Paris", "Moscow"]
-    references = ["Paris", "Berlin"]
-
-    L3Score_test = L3Score()
-
-    results = L3Score_test.compute(
-        questions=questions,
-        predictions=predictions,
-        references=references,
-        api_key=os.environ["OPENAI_API_KEY"],
-        provider="deepseek",
-        model="deepseek-coder",
-    )
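For readers skimming the diff, the new cost bookkeeping is a per-token price lookup: litellm's `model_cost` table supplies `input_cost_per_token` / `output_cost_per_token` for the model, and the token counts come from the LangChain response's `usage_metadata`. A minimal standalone sketch of the same arithmetic (the helper name and the token counts below are illustrative, not part of the module):

```python
from litellm import model_cost  # bundled pricing table keyed by model name


def estimate_call_cost(model_name: str, input_tokens: int, output_tokens: int) -> float:
    """Return the USD cost of one LLM call from per-token prices."""
    prices = model_cost[model_name]  # e.g. "gpt-4o-mini"
    return (
        prices["input_cost_per_token"] * input_tokens
        + prices["output_cost_per_token"] * output_tokens
    )


# Example token counts, as they would appear in response.usage_metadata
print(estimate_call_cost("gpt-4o-mini", input_tokens=250, output_tokens=12))
```

This is the same formula `_get_cost` applies per call; `_compute` then sums it into `total_cost` and reports it under the `Cost` key.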
README.md
CHANGED
@@ -83,7 +83,7 @@ score = l3score.compute(
 )
 
 print(score)
-# {'L3Score': 0.49...}
+# {'L3Score': 0.49..., 'Cost': ...}
 ```
 
 ---
@@ -103,13 +103,13 @@ print(score)
 
 ### 📄 Output
 
-A dictionary with a
+A dictionary with the score and the cost of querying the LLM-provider API:
 
 ```python
-{"L3Score": float}
+{"L3Score": float, "Cost": float}
 ```
 
-The value is the **average score** over all (question, prediction, reference) triplets.
+The values are the **average score** over all (question, prediction, reference) triplets and the total cost of all API calls.
 
 ---
 
@@ -126,7 +126,7 @@ score = l3score.compute(
     provider="openai",
     model="gpt-4o-mini"
 )
-# {'L3Score': 0.99...}
+# {'L3Score': 0.99..., 'Cost': ...}
 
 score = l3score.compute(
     questions=["What is the capital of Germany?"],
@@ -136,7 +136,7 @@ score = l3score.compute(
     provider="openai",
     model="gpt-4o-mini"
 )
-# {'L3Score': 0.00...}
+# {'L3Score': 0.00..., 'Cost': ...}
 ```
 
 ---
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
 git+https://github.com/huggingface/evaluate@main
+litellm==1.67.0.post1
 langchain==0.3.23
 langchain-deepseek==0.1.3
 langchain-openai==0.3.12
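requirements.txt now pins litellm, whose bundled pricing table drives `_get_cost`. The lookup `model_cost[llm.model_name]` fails if the pinned release has no entry for the chosen model, so a quick pre-flight check can help; the model name below is only an example:

```python
from litellm import model_cost  # pricing table shipped with the pinned litellm release

model = "gpt-4o-mini"  # example model name, not a requirement of the metric
if model in model_cost:
    # Per-token prices used by the metric's cost calculation.
    print(model_cost[model]["input_cost_per_token"],
          model_cost[model]["output_cost_per_token"])
else:
    print(f"{model} has no pricing entry in this litellm version")
```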