Commit d9007fb · Niklas Hoepner committed
1 Parent(s): 2009d97

Added cost of API calls to output

Files changed (3):
  1. L3Score.py +52 -40
  2. README.md +6 -6
  3. requirements.txt +1 -0
L3Score.py CHANGED
@@ -23,11 +23,10 @@ import os
 import evaluate
 import datasets
 import numpy as np
-
 import openai
 
 from langchain.chat_models.base import init_chat_model
-
+from litellm import model_cost
 
 _CITATION = """\
 @article{pramanick2024spiqa,
@@ -55,16 +54,17 @@ Args:
         reference should be a string.
 Returns:
     L3Score: mean L3Score for all (question, prediction, reference) triplets.
+    Cost: total cost of the LLM calls.
 Examples:
     Example 1: High certainty the prediction is the same as the ground-truth.
         >>> L3Score = evaluate.load("L3Score")
         >>> L3Score.compute(questions=["What is the capital of France?"], predictions=["Paris"], references=["Paris"], api_key="your-openai-api-key", provider="openai", model="gpt-4o-mini")
-        {'L3Score': 0.99...}
+        {'L3Score': 0.99..., 'Cost': ...}
 
     Example 2: High certainty the prediction is not the same as the ground-truth.
         >>> L3Score = evaluate.load("L3Score")
         >>> L3Score.compute(questions=["What is the capital of Germany?"], predictions=["Moscow"], references=["Berlin"], api_key="your-openai-api-key", provider="openai", model="gpt-4o-mini")
-        {'L3Score': 0.00...}
+        {'L3Score': 0.00..., 'Cost': ...}
 """
 
 
@@ -104,14 +104,20 @@ class L3Score(evaluate.Metric):
             codebase_urls=[
                 "https://github.com/google/spiqa/blob/main/metrics/llmlogscore/llmlogscore.py"
             ],
-            reference_urls=["https://arxiv.org/pdf/2407.09413","https://github.com/google/spiqa","https://huggingface.co/datasets/google/spiqa"],
+            reference_urls=[
+                "https://arxiv.org/pdf/2407.09413",
+                "https://github.com/google/spiqa",
+                "https://huggingface.co/datasets/google/spiqa",
+            ],
         )
 
     def _download_and_prepare(self, dl_manager):
         """Optional: download external resources useful to compute the scores"""
         pass
 
-    def _verify_input(self, questions, predictions, references, provider, api_key, model):
+    def _verify_input(
+        self, questions, predictions, references, provider, api_key, model
+    ):
         """Verify the input parameters"""
 
         if provider not in PROVIDER_WITH_TOP_LOGPROBS:
@@ -120,37 +126,43 @@ class L3Score(evaluate.Metric):
                     PROVIDER_WITH_TOP_LOGPROBS
                 )
             )
-
+
         # Check whether the model is available
-
+
         if provider == "openai":
             client = openai.OpenAI(api_key=api_key)
             model_names = set([model.id for model in client.models.list()])
             if model not in model_names:
-                raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+                raise ValueError(
+                    f"Model {model} not found for provider {provider}, available models: {model_names}"
+                )
-
+
         elif provider == "deepseek":
-            client = openai.OpenAI(api_key=api_key,base_url="https://api.deepseek.com")
+            client = openai.OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
             model_names = [model.id for model in client.models.list()]
             if model not in model_names:
-                raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+                raise ValueError(
+                    f"Model {model} not found for provider {provider}, available models: {model_names}"
+                )
-
+
         elif provider == "xai":
             client = openai.OpenAI(api_key=api_key, base_url="https://api.xai.com")
             model_names = [model.id for model in client.models.list()]
             if model not in model_names:
-                raise ValueError(f"Model {model} not found for provider {provider}, available models: {model_names}")
+                raise ValueError(
+                    f"Model {model} not found for provider {provider}, available models: {model_names}"
+                )
-
-
-        assert len(questions) == len(predictions) == len(references), "Questions, predictions and references must have the same length"
 
+        assert (
+            len(questions) == len(predictions) == len(references)
+        ), "Questions, predictions and references must have the same length"
 
     def _get_llm(self, model, api_key):
         """Get the LLM"""
         llm = init_chat_model(model=model, api_key=api_key)
         llm = llm.bind(logprobs=True, top_logprobs=5)
+        self._model_cost = model_cost[llm.model_name]
         return llm
-
 
     def _compute(
         self,
@@ -162,34 +174,40 @@ class L3Score(evaluate.Metric):
         model="gpt-4o-mini",
     ):
         """Returns the scores"""
-
+
         # Check whether llm can be initialized
         try:
-            self._verify_input(questions, predictions, references, provider, api_key, model)
+            self._verify_input(
+                questions, predictions, references, provider, api_key, model
+            )
         except ValueError as e:
             return {"error": str(e)}
         except openai.AuthenticationError as e:
             message = e.body["message"]
             return {"error": f"Authentication failed: {message}"}
         except Exception as e:
-            return {"error": f"An error occurred when verifying the provider/model match: {e}"}
+            return {
+                "error": f"An error occurred when verifying the provider/model match: {e}"
+            }
 
         # Initialize the LLM
         llm = self._get_llm(model, api_key)
 
-
         L3Score = 0
         count = 0
-
+        total_cost = 0
         for question, prediction, reference in zip(questions, predictions, references):
             try:
                 response = llm.invoke(
                     (
                         "human",
-                        _PROMPT.format(question=question, gt=reference, answer=prediction),
+                        _PROMPT.format(
+                            question=question, gt=reference, answer=prediction
+                        ),
                     )
                 )
-
+                cost = self._get_cost(response)
+                total_cost += cost
             except openai.AuthenticationError as e:
                 message = e.body["message"]
                 return {"error": f"Authentication failed: {message}"}
@@ -214,6 +232,7 @@ class L3Score(evaluate.Metric):
 
         return {
             "L3Score": L3Score,
+            "Cost": total_cost,
         }
 
     def _calculate_L3Score(self, top_logprobs):
@@ -283,21 +302,14 @@ class L3Score(evaluate.Metric):
         """Remove white space and lower case for normalized comparisons."""
         return text.strip().lower()
 
+    def _get_cost(self, response):
+        """Get the cost of the response"""
+        return (
+            self._model_cost["input_cost_per_token"]
+            * response.usage_metadata["input_tokens"]
+            + self._model_cost["output_cost_per_token"]
+            * response.usage_metadata["output_tokens"]
+        )
 
-if __name__ == "__main__":
-
-    questions = ["What is the capital of France?", "What is the capital of Germany?"]
-    predictions = ["Paris", "Moscow"]
-    references = ["Paris", "Berlin"]
 
-    L3Score_test = L3Score()
 
-    results = L3Score_test.compute(
-        questions=questions,
-        predictions=predictions,
-        references=references,
-        api_key=os.environ["OPENAI_API_KEY"],
-        provider="deepseek",
-        model="deepseek-coder",
-    )
-
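For readers skimming the diff: the new `_get_cost` helper prices each judge call by multiplying the token counts reported in the LangChain response's `usage_metadata` by the per-token rates litellm ships in its `model_cost` table, and `_compute` sums these into the returned `Cost`. Below is a minimal standalone sketch of the same arithmetic; the function name and the example token counts are illustrative, not part of the repository.

```python
# Sketch of the per-call cost arithmetic used by _get_cost (illustrative, not repo code).
from litellm import model_cost


def estimate_call_cost(model_name: str, input_tokens: int, output_tokens: int) -> float:
    """Price one LLM call from its token usage and litellm's per-token rates."""
    rates = model_cost[model_name]
    return (
        rates["input_cost_per_token"] * input_tokens
        + rates["output_cost_per_token"] * output_tokens
    )


# Example: one judge call with 250 prompt tokens and 5 completion tokens.
print(estimate_call_cost("gpt-4o-mini", input_tokens=250, output_tokens=5))
```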
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -83,7 +83,7 @@ score = l3score.compute(
 )
 
 print(score)
-# {'L3Score': 0.49...}
+# {'L3Score': 0.49..., 'Cost': ...}
 ```
 
 ---
@@ -103,13 +103,13 @@ print(score)
 
 ### 📄 Output
 
-A dictionary with a single key:
+A dictionary with the score and the cost of querying the LLM-provider API:
 
 ```python
-{"L3Score": float}
+{"L3Score": float, "Cost": float}
 ```
 
-The value is the **average score** over all (question, prediction, reference) triplets.
+`L3Score` is the **average score** over all (question, prediction, reference) triplets; `Cost` is the total cost of all API calls.
 
 ---
 
@@ -126,7 +126,7 @@ score = l3score.compute(
     provider="openai",
     model="gpt-4o-mini"
 )
-# {'L3Score': 0.99...}
+# {'L3Score': 0.99..., 'Cost': ...}
 
 score = l3score.compute(
     questions=["What is the capital of Germany?"],
@@ -136,7 +136,7 @@ score = l3score.compute(
     provider="openai",
     model="gpt-4o-mini"
 )
-# {'L3Score': 0.00...}
+# {'L3Score': 0.00..., 'Cost': ...}
 ```
 
 ---
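To illustrate the change from the caller's side, here is a usage sketch mirroring the README examples; the metric is loaded with the same identifier as in the docstring examples, and the exact load path and values depend on your setup. After this commit, the result dictionary carries the cost next to the score.

```python
# Usage sketch following the README examples; assumes OPENAI_API_KEY is set.
import os
import evaluate

l3score = evaluate.load("L3Score")

result = l3score.compute(
    questions=["What is the capital of France?"],
    predictions=["Paris"],
    references=["Paris"],
    api_key=os.environ["OPENAI_API_KEY"],
    provider="openai",
    model="gpt-4o-mini",
)
print(result)  # e.g. {'L3Score': 0.99..., 'Cost': ...}
```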
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 git+https://github.com/huggingface/evaluate@main
+litellm==1.67.0.post1
 langchain==0.3.23
 langchain-deepseek==0.1.3
 langchain-openai==0.3.12
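As far as this commit goes, the new `litellm` pin is used only for its `model_cost` pricing table. A quick sanity check that the pinned version exposes the per-token rate fields `_get_cost` reads (the model name is just an example):

```python
# Check that litellm's pricing table exposes the fields L3Score.py reads.
from litellm import model_cost

rates = model_cost["gpt-4o-mini"]  # example judge model
print(rates["input_cost_per_token"], rates["output_cost_per_token"])
```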