Niklas Hoepner committed
Commit d9007fb · Parent(s): 2009d97

Added cost of API calls to output

Files changed:
- L3Score.py +52 -40
- README.md +6 -6
- requirements.txt +1 -0
L3Score.py
CHANGED
@@ -23,11 +23,10 @@ import os
 import evaluate
 import datasets
 import numpy as np
-
 import openai
 
 from langchain.chat_models.base import init_chat_model
+from litellm import model_cost
 
 _CITATION = """\
 @article{pramanick2024spiqa,
@@ -55,16 +54,17 @@ Args:
         reference should be a string.
 Returns:
     L3Score: mean L3Score for all (question, prediction, reference) triplets.
+    Cost: total cost of the LLM calls.
 Examples:
     Example 1: High certainty the prediction is the same as the ground-truth.
         >>> L3Score = evaluate.load("L3Score")
         >>> L3Score.compute(questions=["What is the capital of France?"], predictions=["Paris"], references=["Paris"], api_key="your-openai-api-key", provider="openai", model="gpt-4o-mini")
-        {'L3Score': 0.99...}
+        {'L3Score': 0.99..., 'Cost': ...}
 
     Example 2: High certainty the prediction is not the same as the ground-truth.
         >>> L3Score = evaluate.load("L3Score")
         >>> L3Score.compute(questions=["What is the capital of Germany?"], predictions=["Moscow"], references=["Berlin"], api_key="your-openai-api-key", provider="openai", model="gpt-4o-mini")
-        {'L3Score': 0.00...}
+        {'L3Score': 0.00..., 'Cost': ...}
 """
 
 
@@ -104,14 +104,20 @@ class L3Score(evaluate.Metric):
             codebase_urls=[
                 "https://github.com/google/spiqa/blob/main/metrics/llmlogscore/llmlogscore.py"
             ],
-            reference_urls=[
+            reference_urls=[
+                "https://arxiv.org/pdf/2407.09413",
+                "https://github.com/google/spiqa",
+                "https://huggingface.co/datasets/google/spiqa",
+            ],
         )
 
     def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        pass
 
-    def _verify_input(
+    def _verify_input(
+        self, questions, predictions, references, provider, api_key, model
+    ):
        """Verify the input parameters"""
 
        if provider not in PROVIDER_WITH_TOP_LOGPROBS:
@@ -120,37 +126,43 @@ class L3Score(evaluate.Metric):
                    PROVIDER_WITH_TOP_LOGPROBS
                )
            )
 
        # Check whether the model is available
 
        if provider == "openai":
            client = openai.OpenAI(api_key=api_key)
            model_names = set([model.id for model in client.models.list()])
            if model not in model_names:
-                raise ValueError(
+                raise ValueError(
+                    f"Model {model} not found for provider {provider}, available models: {model_names}"
+                )
+
        elif provider == "deepseek":
-            client = openai.OpenAI(api_key=api_key,base_url="https://api.deepseek.com")
+            client = openai.OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
            model_names = [model.id for model in client.models.list()]
            if model not in model_names:
-                raise ValueError(
+                raise ValueError(
+                    f"Model {model} not found for provider {provider}, available models: {model_names}"
+                )
+
        elif provider == "xai":
            client = openai.OpenAI(api_key=api_key, base_url="https://api.xai.com")
            model_names = [model.id for model in client.models.list()]
            if model not in model_names:
-                raise ValueError(
+                raise ValueError(
+                    f"Model {model} not found for provider {provider}, available models: {model_names}"
+                )
 
-        assert len(questions) == len(predictions) == len(references), "Questions, predictions and references must have the same length"
+        assert (
+            len(questions) == len(predictions) == len(references)
+        ), "Questions, predictions and references must have the same length"
 
    def _get_llm(self, model, api_key):
        """Get the LLM"""
        llm = init_chat_model(model=model, api_key=api_key)
        llm = llm.bind(logprobs=True, top_logprobs=5)
+        self._model_cost = model_cost[llm.model_name]
        return llm
 
    def _compute(
        self,
@@ -162,34 +174,40 @@ class L3Score(evaluate.Metric):
        model="gpt-4o-mini",
    ):
        """Returns the scores"""
 
        # Check whether llm can be initialized
        try:
-            self._verify_input(
+            self._verify_input(
+                questions, predictions, references, provider, api_key, model
+            )
        except ValueError as e:
            return {"error": str(e)}
        except openai.AuthenticationError as e:
            message = e.body["message"]
            return {"error": f"Authentication failed: {message}"}
        except Exception as e:
-            return {
+            return {
+                "error": f"An error occurred when verifying the provider/model match: {e}"
+            }
 
        # Initialize the LLM
        llm = self._get_llm(model, api_key)
 
        L3Score = 0
        count = 0
+        total_cost = 0
        for question, prediction, reference in zip(questions, predictions, references):
            try:
                response = llm.invoke(
                    (
                        "human",
-                        _PROMPT.format(
+                        _PROMPT.format(
+                            question=question, gt=reference, answer=prediction
+                        ),
                    )
                )
+                cost = self._get_cost(response)
+                total_cost += cost
            except openai.AuthenticationError as e:
                message = e.body["message"]
                return {"error": f"Authentication failed: {message}"}
@@ -214,6 +232,7 @@ class L3Score(evaluate.Metric):
 
        return {
            "L3Score": L3Score,
+            "Cost": total_cost,
        }
 
    def _calculate_L3Score(self, top_logprobs):
@@ -283,21 +302,14 @@ class L3Score(evaluate.Metric):
        """Remove white space and lower case for normalized comparisons."""
        return text.strip().lower()
 
+    def _get_cost(self, response):
+        """Get the cost of the response"""
+        return (
+            self._model_cost["input_cost_per_token"]
+            * response.usage_metadata["input_tokens"]
+            + self._model_cost["output_cost_per_token"]
+            * response.usage_metadata["output_tokens"]
+        )
 
-if __name__ == "__main__":
-
-    questions = ["What is the capital of France?", "What is the capital of Germany?"]
-    predictions = ["Paris", "Moscow"]
-    references = ["Paris", "Berlin"]
-
-    L3Score_test = L3Score()
-
-    results = L3Score_test.compute(
-        questions=questions,
-        predictions=predictions,
-        references=references,
-        api_key=os.environ["OPENAI_API_KEY"],
-        provider="deepseek",
-        model="deepseek-coder",
-    )
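For readers skimming the diff, the new cost bookkeeping is a per-token price lookup: litellm's `model_cost` table supplies `input_cost_per_token` / `output_cost_per_token` for the model, and the token counts come from the LangChain response's `usage_metadata`. A minimal standalone sketch of the same arithmetic (the helper name and the token counts below are illustrative, not part of the module):

```python
from litellm import model_cost  # bundled pricing table keyed by model name


def estimate_call_cost(model_name: str, input_tokens: int, output_tokens: int) -> float:
    """Return the USD cost of one LLM call from per-token prices."""
    prices = model_cost[model_name]  # e.g. "gpt-4o-mini"
    return (
        prices["input_cost_per_token"] * input_tokens
        + prices["output_cost_per_token"] * output_tokens
    )


# Example token counts, as they would appear in response.usage_metadata
print(estimate_call_cost("gpt-4o-mini", input_tokens=250, output_tokens=12))
```

This is the same formula `_get_cost` applies per call; `_compute` then sums it into `total_cost` and reports it under the `Cost` key.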
README.md
CHANGED
@@ -83,7 +83,7 @@ score = l3score.compute(
 )
 
 print(score)
-# {'L3Score': 0.49...}
+# {'L3Score': 0.49..., 'Cost': ...}
 ```
 
 ---
@@ -103,13 +103,13 @@ print(score)
 
 ### 📄 Output
 
-A dictionary with a
+A dictionary with the score and the cost of querying the LLM-provider API:
 
 ```python
-{"L3Score": float}
+{"L3Score": float, "Cost": float}
 ```
 
-The value is the **average score** over all (question, prediction, reference) triplets.
+The values are the **average score** over all (question, prediction, reference) triplets and the total cost of all API calls.
 
 ---
 
@@ -126,7 +126,7 @@ score = l3score.compute(
     provider="openai",
     model="gpt-4o-mini"
 )
-# {'L3Score': 0.99...}
+# {'L3Score': 0.99..., 'Cost': ...}
 
 score = l3score.compute(
     questions=["What is the capital of Germany?"],
@@ -136,7 +136,7 @@ score = l3score.compute(
     provider="openai",
     model="gpt-4o-mini"
 )
-# {'L3Score': 0.00...}
+# {'L3Score': 0.00..., 'Cost': ...}
 ```
 
 ---
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
 git+https://github.com/huggingface/evaluate@main
+litellm==1.67.0.post1
 langchain==0.3.23
 langchain-deepseek==0.1.3
 langchain-openai==0.3.12
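requirements.txt now pins litellm, whose bundled pricing table drives `_get_cost`. The lookup `model_cost[llm.model_name]` fails if the pinned release has no entry for the chosen model, so a quick pre-flight check can help; the model name below is only an example:

```python
from litellm import model_cost  # pricing table shipped with the pinned litellm release

model = "gpt-4o-mini"  # example model name, not a requirement of the metric
if model in model_cost:
    # Per-token prices used by the metric's cost calculation.
    print(model_cost[model]["input_cost_per_token"],
          model_cost[model]["output_cost_per_token"])
else:
    print(f"{model} has no pricing entry in this litellm version")
```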