Elron committed (verified)
Commit d346c89 · Parent: 35fffae

Upload folder using huggingface_hub
README.md CHANGED
@@ -40,11 +40,11 @@ https://github.com/IBM/unitxt/assets/23455264/baef9131-39d4-4164-90b2-05da52919f
 
 ### 🦄 Currently on Unitxt Catalog
 
-![Abstract Tasks](https://img.shields.io/badge/Abstract_Tasks-62-blue)
-![Dataset Cards](https://img.shields.io/badge/Dataset_Cards-3025-blue)
+![Abstract Tasks](https://img.shields.io/badge/Abstract_Tasks-64-blue)
+![Dataset Cards](https://img.shields.io/badge/Dataset_Cards-3174-blue)
 ![Templates](https://img.shields.io/badge/Templates-342-blue)
-![Benchmarks](https://img.shields.io/badge/Benchmarks-4-blue)
-![Metrics](https://img.shields.io/badge/Metrics-422-blue)
+![Benchmarks](https://img.shields.io/badge/Benchmarks-6-blue)
+![Metrics](https://img.shields.io/badge/Metrics-462-blue)
 
 ### 🦄 Run Unitxt Exploration Dashboard
 
api.py CHANGED
@@ -21,7 +21,7 @@ from .loaders import LoadFromDictionary
 from .logging_utils import get_logger
 from .metric_utils import EvaluationResults, _compute, _inference_post_process
 from .operator import SourceOperator
-from .schema import loads_instance
+from .schema import loads_batch
 from .settings_utils import get_constants, get_settings
 from .standard import DatasetRecipe
 from .task import Task
@@ -98,6 +98,7 @@ def create_dataset(
     train_set: Optional[List[Dict[Any, Any]]] = None,
     validation_set: Optional[List[Dict[Any, Any]]] = None,
     split: Optional[str] = None,
+    data_classification_policy: Optional[List[str]] = None,
     **kwargs,
 ) -> Union[DatasetDict, IterableDatasetDict, Dataset, IterableDataset]:
     """Creates dataset from input data based on a specific task.
@@ -108,6 +109,7 @@ def create_dataset(
         train_set : optional train_set
         validation_set: optional validation set
         split: optional one split to choose
+        data_classification_policy: data_classification_policy
         **kwargs: Arguments used to load dataset from provided datasets (see load_dataset())
 
     Returns:
@@ -129,7 +131,7 @@
             f"No 'template' was passed to the create_dataset() and the given task ('{task.__id__}') has no 'default_template' field."
         )
 
-    card = TaskCard(loader=LoadFromDictionary(data=data), task=task)
+    card = TaskCard(loader=LoadFromDictionary(data=data, data_classification_policy=data_classification_policy), task=task)
     return load_dataset(card=card, split=split, **kwargs)
 
 
@@ -283,7 +285,7 @@ def produce(
     result = _get_produce_with_cache(dataset_query, **kwargs)(instance_or_instances)
     if not is_list:
         return result[0]
-    return Dataset.from_list(result).with_transform(loads_instance)
+    return Dataset.from_list(result).with_transform(loads_batch)
 
 
 def infer(
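The api.py hunks above thread a new `data_classification_policy` argument from `create_dataset()` into the `LoadFromDictionary` loader. A minimal usage sketch under assumptions; the task id, the `test_set` keyword, and the example fields are illustrative, not taken from this commit:

```python
# Hypothetical usage sketch of the new data_classification_policy argument.
from unitxt.api import create_dataset

data = [{"question": "What is the capital of France?", "answers": ["Paris"]}]

dataset = create_dataset(
    task="tasks.qa.open",                        # assumed catalog task id
    test_set=data,                               # assumed split keyword, mirrors train_set/validation_set
    split="test",
    data_classification_policy=["proprietary"],  # forwarded to LoadFromDictionary
)
```

The idea of the change: data loaded this way carries its classification, so only inference engines whose own `data_classification_policy` allows those classes will accept the instances.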
dataset.py CHANGED
@@ -1,5 +1,5 @@
 import os
-from typing import Optional, Union
+from typing import Dict, Optional, Union
 
 import datasets
 
@@ -50,7 +50,7 @@ from .random_utils import __file__ as _
 from .recipe import __file__ as _
 from .register import __file__ as _
 from .schema import __file__ as _
-from .schema import loads_instance
+from .schema import loads_batch, loads_instance
 from .serializers import __file__ as _
 from .settings_utils import __file__ as _
 from .settings_utils import get_constants
@@ -120,6 +120,13 @@ class Dataset(datasets.GeneratorBasedBuilder):
             dl_manager, "no_checks", **prepare_splits_kwargs
         )
 
+    def as_streaming_dataset(self, split: Optional[str] = None, base_path: Optional[str] = None) -> Union[Dict[str, datasets.IterableDataset], datasets.IterableDataset]:
+        return (
+            super()
+            .as_streaming_dataset(split, base_path=base_path)
+            .map(loads_instance)
+        )
+
     def as_dataset(
         self,
         split: Optional[datasets.Split] = None,
@@ -162,5 +169,5 @@ class Dataset(datasets.GeneratorBasedBuilder):
         return (
             super()
             .as_dataset(split, run_post_process, verification_mode, in_memory)
-            .with_transform(loads_instance)
+            .with_transform(loads_batch)
         )
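The dataset.py hunks split the decoding step by access path: the new `as_streaming_dataset()` maps `loads_instance` per example over an `IterableDataset`, while the in-memory `as_dataset()` now applies `loads_batch` as an on-access transform. A self-contained sketch of that split using the `datasets` library; the JSON decoding of `task_data` is an assumption about what these helpers do, not the library's actual implementation:

```python
import json
from datasets import Dataset

def loads_instance_like(example):
    # per-example hook, the shape expected by IterableDataset.map()
    example["task_data"] = json.loads(example["task_data"])
    return example

def loads_batch_like(batch):
    # columnar batch hook, the shape expected by Dataset.with_transform()
    batch["task_data"] = [json.loads(x) for x in batch["task_data"]]
    return batch

ds = Dataset.from_list([{"task_data": json.dumps({"label": 1})}])
eager = ds.with_transform(loads_batch_like)                   # decoded lazily on access
streamed = ds.to_iterable_dataset().map(loads_instance_like)  # decoded while streaming
```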
error_utils.py CHANGED
@@ -18,6 +18,7 @@ class Documentation:
     BENCHMARKS = "docs/benchmark.html"
     DATA_CLASSIFICATION_POLICY = "docs/data_classification_policy.html"
     CATALOG = "docs/saving_and_loading_from_catalog.html"
+    SETTINGS = "docs/settings.html"
 
 
 def additional_info(path: str) -> str:
inference.py CHANGED
@@ -2,6 +2,7 @@ import abc
 import asyncio
 import base64
 import dataclasses
+import hashlib
 import io
 import json
 import logging
@@ -12,6 +13,7 @@ import time
 import uuid
 from collections import Counter
 from datetime import datetime
+from itertools import islice
 from multiprocessing.pool import ThreadPool
 from typing import (
     Any,
@@ -29,6 +31,7 @@ from typing import (
 )
 
 from datasets import Dataset, DatasetDict, Image
+from diskcache import Cache
 from tqdm import tqdm, trange
 from tqdm.asyncio import tqdm_asyncio
 
@@ -53,6 +56,11 @@ settings = get_settings()
 logger = get_logger()
 
 
+def batched(lst, n):
+    it = iter(lst)
+    while batch := list(islice(it, n)):
+        yield batch
+
 class StandardAPIParamsMixin(Artifact):
     model: str
     frequency_penalty: Optional[float] = None
@@ -149,6 +157,8 @@ class ListWithMetadata(List[T]):
 
 class InferenceEngine(Artifact):
     """Abstract base class for inference."""
+    cache_batch_size: int = 100
+    use_cache: bool = True
 
     @abc.abstractmethod
     def _infer(
@@ -173,6 +183,7 @@ class InferenceEngine(Artifact):
         if not settings.mock_inference_mode:
             super().prepare()  # no need to prepare a mock
         self.prepare_engine()
+        self._cache = Cache(get_settings().inference_engine_cache_path + self.__class__.__name__)
 
     def __call__(
         self,
@@ -181,16 +192,20 @@
     ) -> Union[ListWithMetadata[str], ListWithMetadata[TextGenerationInferenceOutput]]:
         return self.infer(dataset=dataset, return_meta_data=return_meta_data)
 
-    def infer(
-        self,
-        dataset: Union[List[Dict[str, Any]], Dataset],
-        return_meta_data: bool = False,
-    ) -> Union[ListWithMetadata[str], ListWithMetadata[TextGenerationInferenceOutput]]:
-        """Verifies instances of a dataset and perform inference on the input dataset.
-
-        If return_meta_data - returns a list of TextGenerationInferenceOutput, else returns a list of the string
-        predictions.
-        """
+    def get_instance_cache_key(self, instance):
+        instance_key_fields = ["media", "source", "task_data"]
+        return {key: instance[key] for key in instance if key in instance_key_fields}
+
+    def _get_cache_key(self, instance: Dict[str, Any]) -> str:
+        """Generate a unique cache key for each input."""
+        record = self.get_instance_cache_key(instance)
+        record.update(self.to_dict())
+        instance_str = json.dumps(record, sort_keys=True)
+        return hashlib.md5(instance_str.encode()).hexdigest()
+
+    def verify_infer_inputs(self,
+                            dataset: Union[List[Dict[str, Any]], Dataset],
+                            return_meta_data: bool):
         if not isoftype(dataset, Union[List[Dict[str, Any]], Dataset]):
             raise Exception(
                 "Dataset passed to infer() is not list of dictionaries or Huggingface Dataset"
@@ -202,10 +217,54 @@
             )
 
         [self.verify_instance(instance) for instance in dataset]
+
+    def infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], Dataset],
+        return_meta_data: bool = False,
+    ) -> Union[ListWithMetadata[str], ListWithMetadata[TextGenerationInferenceOutput]]:
+        """Verifies instances of a dataset and perform inference on the input dataset.
+
+        If return_meta_data - returns a list of TextGenerationInferenceOutput, else returns a list of the string
+        predictions.
+        """
+        self.verify_infer_inputs(dataset, return_meta_data)
         if settings.mock_inference_mode:
             result = self._mock_infer(dataset)
         else:
-            result = self._infer(dataset, return_meta_data)
+            if self.use_cache:
+                number_of_batches = len(dataset) // self.cache_batch_size + 1
+                result = []
+                for batch_index, batch in enumerate(batched(dataset, self.cache_batch_size)):
+                    cached_results = []
+                    missing_examples = []
+                    for i, item in enumerate(batch):
+                        cache_key = self._get_cache_key(item)
+                        cached_value = self._cache.get(cache_key)
+                        if cached_value is not None:
+                            cached_results.append((i, cached_value))  # each element is index in batch, and value
+                        else:
+                            missing_examples.append((i, item))  # each element is index in batch and example
+                    # infer on missing examples only, without indices
+
+                    logger.info(f"Inferring batch {batch_index + 1} / {number_of_batches} with {len(missing_examples)} instances (found {len(cached_results)} instances in {self._cache.directory})")
+                    if (len(missing_examples) > 0):
+                        inferred_results = self._infer([e[1] for e in missing_examples], return_meta_data)
+                        # recombined to index and value
+                        inferred_results = list(zip([e[0] for e in missing_examples], inferred_results))
+                        # Add missing examples to cache
+                        for (_, item), (_, prediction) in zip(missing_examples, inferred_results):
+                            if prediction is None:
+                                continue
+                            cache_key = self._get_cache_key(item)
+                            self._cache[cache_key] = prediction
+                    else:
+                        inferred_results = []
+                    # Combine cached and inferred results in original order
+                    batch_predictions = [p[1] for p in sorted(cached_results + inferred_results)]
+                    result.extend(batch_predictions)
+            else:
+                result = self._infer(dataset, return_meta_data)
         return ListWithMetadata(
             result,
             metadata={
@@ -221,6 +280,7 @@ class InferenceEngine(Artifact):
     ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
         return [str(instance["source"]) for instance in dataset]
 
+    @abc.abstractmethod
     def get_engine_id(self):
         raise NotImplementedError()
 
@@ -918,16 +978,18 @@ class HFPipelineBasedInferenceEngine(
         return args
 
     def _create_pipeline(self, model_args: Dict[str, Any]):
-        from transformers import pipeline
+        from transformers import AutoTokenizer, pipeline
 
         path = self.model_name
         if settings.hf_offline_models_path is not None:
             path = os.path.join(settings.hf_offline_models_path, path)
 
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         self.model = pipeline(
             model=path,
             task=self.task,
             use_fast=self.use_fast_tokenizer,
+            tokenizer=tokenizer,
             trust_remote_code=settings.allow_unverified_code,
             **model_args,
             **self.to_dict(
@@ -1302,7 +1364,7 @@ class IbmGenAiInferenceEngine(
     def _get_credentials():
         from genai import Credentials
 
-        api_key_env_var_name = "GENAI_KEY"
+        api_key_env_var_name = "GENAI_KEY"  # pragma: allowlist secret
         api_key = os.environ.get(api_key_env_var_name)
 
         assert api_key is not None, (
@@ -1718,7 +1780,7 @@ class AzureOpenAIInferenceEngine(OpenAiInferenceEngine):
         ), "Error while trying to run AzureOpenAIInferenceEngine: Missing environment variable param AZURE_OPENAI_HOST or OPENAI_API_VERSION"
         api_url = f"{azure_openapi_host}/openai/deployments/{self.model_name}/chat/completions?api-version={api_version}"
 
-        return {"api_key": api_key, "api_url": api_url}
+        return {"api_key": api_key, "api_url": api_url, "api_version": api_version}
 
     def create_client(self):
         from openai import AzureOpenAI
@@ -1727,12 +1789,13 @@ class AzureOpenAIInferenceEngine(OpenAiInferenceEngine):
         return AzureOpenAI(
             api_key=self.credentials["api_key"],
             base_url=self.credentials["api_url"],
+            api_version=self.credentials["api_version"],
             default_headers=self.get_default_headers(),
         )
 
 
 class VLLMRemoteInferenceEngine(OpenAiInferenceEngine):
-    label: str = "vllm"
+    label: str = "vllm-remote"
 
 
 class RITSInferenceEngine(
@@ -1741,6 +1804,10 @@ class RITSInferenceEngine(
     label: str = "rits"
     data_classification_policy = ["public", "proprietary"]
 
+    model_names_dict = {
+        "microsoft/phi-4": "microsoft-phi-4"
+    }
+
     def get_default_headers(self):
         return {"RITS_API_KEY": self.credentials["api_key"]}
 
@@ -1761,8 +1828,10 @@ class RITSInferenceEngine(
             RITSInferenceEngine._get_model_name_for_endpoint(model_name)
         )
 
-    @staticmethod
-    def _get_model_name_for_endpoint(model_name: str):
+    @classmethod
+    def _get_model_name_for_endpoint(cls, model_name: str):
+        if model_name in cls.model_names_dict:
+            return cls.model_names_dict[model_name]
         return (
             model_name.split("/")[-1]
             .lower()
@@ -1805,7 +1874,7 @@ class TogetherAiInferenceEngine(
         from together import Together
         from together.types.models import ModelType
 
-        api_key_env_var_name = "TOGETHER_API_KEY"
+        api_key_env_var_name = "TOGETHER_API_KEY"  # pragma: allowlist secret
        api_key = os.environ.get(api_key_env_var_name)
        assert api_key is not None, (
            f"Error while trying to run TogetherAiInferenceEngine."
@@ -1969,6 +2038,9 @@ class WMLInferenceEngineBase(
         deployment_id (str, optional):
             Deployment ID of a tuned model to be used for
             inference. Mutually exclusive with 'model_name'.
+        concurrency_limit (int):
+            Number of concurrent requests sent to a model. Default is 10,
+            which is also the maximum value for the generation.
         parameters (Union[WMLInferenceEngineParams, WMLGenerationParamsMixin, WMLChatParamsMixin], optional):
             Defines inference parameters and their values. Deprecated attribute, please pass respective
             parameters directly to the respective class instead.
@@ -1977,6 +2049,7 @@ class WMLInferenceEngineBase(
     credentials: Optional[CredentialsWML] = None
     model_name: Optional[str] = None
     deployment_id: Optional[str] = None
+    concurrency_limit: int = 10
     label: str = "wml"
     _requirements_list = {
         "ibm_watsonx_ai": "Install ibm-watsonx-ai package using 'pip install --upgrade ibm-watsonx-ai'. "
@@ -2230,11 +2303,6 @@ class WMLInferenceEngineGeneration(WMLInferenceEngineBase, WMLGenerationParamsMi
 
     If you want to include images in your input, please use 'WMLInferenceEngineChat' instead.
 
-    Args:
-        concurrency_limit (int):
-            Number of concurrent requests sent to a model. Default is 10,
-            which is also the maximum value.
-
     Examples:
         .. code-block:: python
 
@@ -2258,8 +2326,6 @@ class WMLInferenceEngineGeneration(WMLInferenceEngineBase, WMLGenerationParamsMi
         results = wml_inference.infer(dataset["test"])
     """
 
-    concurrency_limit: int = 10
-
     def verify(self):
         super().verify()
 
@@ -2511,6 +2577,32 @@ class WMLInferenceEngineChat(WMLInferenceEngineBase, WMLChatParamsMixin):
         # images as SDK allows sending only one image per message.
         return [messages]
 
+    def _handle_async_requests(
+        self,
+        messages: List[List[Dict[str, Any]]],
+        params: Dict[str, Any],
+    ) -> List[Dict[str, Any]]:
+        async def handle_async_requests(start_idx, end_idx):
+            coroutines = [
+                self._model.achat(messages=messages[idx], params=params)
+                for idx in range(start_idx, end_idx)
+            ]
+            batch_results = await asyncio.gather(*coroutines)
+            return list(batch_results)
+
+        loop = asyncio.get_event_loop()
+        results = []
+
+        for batch_idx in range(0, len(messages), self.concurrency_limit):
+            batch_results = loop.run_until_complete(
+                handle_async_requests(
+                    batch_idx, min(batch_idx + self.concurrency_limit, len(messages))
+                )
+            )
+            results.extend(batch_results)
+
+        return results
+
     def _send_requests(
         self,
         dataset: Union[List[Dict[str, Any]], Dataset],
@@ -2526,27 +2618,25 @@ class WMLInferenceEngineChat(WMLInferenceEngineBase, WMLChatParamsMixin):
             output_type = "message"
             params["logprobs"] = False
 
-        final_results = []
-
-        for instance in dataset:
-            messages = self.to_messages(instance)
-
-            for message in messages:
-                result = self._model.chat(
-                    messages=message,
-                    params=params,
-                )
-
-                final_results.append(
-                    self.get_return_object(
-                        result["choices"][0][output_type]["content"],
-                        result,
-                        instance["source"],
-                        return_meta_data,
-                    )
-                )
-
-        return final_results
+        indexed_messages = [
+            (i, message)
+            for i in range(len(dataset))
+            for message in self.to_messages(dataset[i])
+        ]
+
+        results = self._handle_async_requests(
+            [msg[1] for msg in indexed_messages], params
+        )
+
+        return [
+            self.get_return_object(
+                result["choices"][0][output_type]["content"],
+                result,
+                dataset[idx[0]]["source"],
+                return_meta_data,
+            )
+            for result, idx in zip(results, indexed_messages)
+        ]
 
     def get_return_object(self, predict_result, result, input_text, return_meta_data):
         if return_meta_data:
@@ -2614,6 +2704,7 @@ def get_text_without_images(instance, image_token="<image>"):
 class LMMSEvalBaseInferenceEngine(
     InferenceEngine, PackageRequirementsMixin, LazyLoadMixin, TorchDeviceMixin
 ):
+    label = "lmms-eval"
     model_type: str
     model_args: Dict[str, str]
     batch_size: int = 1
@@ -2623,6 +2714,9 @@ class LMMSEvalBaseInferenceEngine(
         "lmms_eval": "Install llms-eval package using 'pip install lmms-eval==0.2.4'",
     }
 
+    def get_engine_id(self):
+        return get_model_and_label_id(self.model_type, self.label)
+
     def prepare_engine(self):
         if not self.lazy_load:
             self._prepare_engine()
@@ -2798,6 +2892,11 @@ class VLLMParamsMixin(Artifact):
 
 
 class VLLMInferenceEngine(InferenceEngine, PackageRequirementsMixin, VLLMParamsMixin):
+    label="vllm"
+
+    def get_engine_id(self):
+        return get_model_and_label_id(self.model, self.label)
+
     def prepare_engine(self):
         args = self.to_dict([VLLMParamsMixin])
         args.pop("model")
@@ -2883,6 +2982,9 @@ class LiteLLMInferenceEngine(
 
     _requirements_list: list = ["litellm", "tenacity", "tqdm", "diskcache"]
 
+    def get_engine_id(self):
+        return get_model_and_label_id(self.model, self.label)
+
     def prepare_engine(self):
         # Initialize the token bucket rate limiter
         self._rate_limiter = AsyncTokenBucket(
@@ -2890,15 +2992,12 @@ class LiteLLMInferenceEngine(
             capacity=self.max_requests_per_second,
         )
         self.inference_type = "litellm"
-        import litellm
         from litellm import acompletion
-        from litellm.caching.caching import Cache
 
-        litellm.cache = Cache(type="disk")
 
         self._completion = acompletion
         # Initialize a semaphore to limit concurrency
-        self._semaphore = asyncio.Semaphore(self.max_requests_per_second)
+        self._semaphore = asyncio.Semaphore(round(self.max_requests_per_second))
 
     async def _infer_instance(
         self, index: int, instance: Dict[str, Any]
@@ -3010,28 +3109,34 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
     provider_specific_args: Optional[Dict[str, Dict[str,str]]] = None
 
     provider_model_map: Dict[_supported_apis, Dict[str, str]] = {
-        "watsonx": {
-            "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct",
-            "llama-3-70b-instruct": "watsonx/meta-llama/llama-3-70b-instruct",
-            "llama-3-1-70b-instruct": "watsonx/meta-llama/llama-3-1-70b-instruct",
-            "llama-3-3-70b-instruct": "watsonx/meta-llama/llama-3-3-70b-instruct",
-            "granite-3-8b-instruct": "watsonx/ibm/granite-3-8b-instruct",
-            "flan-t5-xxl": "watsonx/google/flan-t5-xxl",
-            "llama-3-2-1b-instruct": "watsonx/meta-llama/llama-3-2-1b-instruct",
-            "llama-3-2-11b-vision-instruct": "watsonx/meta-llama/llama-3-2-11b-vision-instruct",
-            "llama-3-2-90b-vision-instruct": "watsonx/meta-llama/llama-3-2-90b-vision-instruct",
-            "mistral-large-instruct": "watsonx/mistralai/mistral-large",
-        },
-        "watsonx-sdk": {
-            "llama-3-2-11b-vision-instruct": "meta-llama/llama-3-2-11b-vision-instruct",
-            "llama-3-8b-instruct": "meta-llama/llama-3-8b-instruct",
-            "llama-3-70b-instruct": "meta-llama/llama-3-70b-instruct",
+        "watsonx-sdk": {  # checked from ibm_watsonx_ai.APIClient().foundation_models.ChatModels
+            "granite-20b-code-instruct": "ibm/granite-20b-code-instruct",
+            "granite-3-2-8b-instruct": "ibm/granite-3-2-8b-instruct",
+            "granite-3-2b-instruct": "ibm/granite-3-2b-instruct",
             "granite-3-8b-instruct": "ibm/granite-3-8b-instruct",
+            "granite-34b-code-instruct": "ibm/granite-34b-code-instruct",
+            "granite-guardian-3-8b": "ibm/granite-guardian-3-8b",
+            "granite-vision-3-2-2b": "ibm/granite-vision-3-2-2b",
+            "llama-3-1-8b-instruct": "meta-llama/llama-3-1-8b-instruct",
+            "llama-3-1-70b-instruct": "meta-llama/llama-3-1-70b-instruct",
+            "llama-3-1-405b-instruct": "meta-llama/llama-3-405b-instruct",
+            "llama-3-2-11b-vision-instruct": "meta-llama/llama-3-2-11b-vision-instruct",
+            "llama-3-2-1b-instruct": "meta-llama/llama-3-2-1b-instruct",
+            "llama-3-2-3b-instruct": "meta-llama/llama-3-2-3b-instruct",
+            "llama-3-2-90b-vision-instruct": "meta-llama/llama-3-2-90b-vision-instruct",
+            "llama-3-3-70b-instruct": "meta-llama/llama-3-3-70b-instruct",
+            "llama-guard-3-11b-vision": "meta-llama/llama-guard-3-11b-vision",
+            "mistral-large-instruct": "mistralai/mistral-large",
+            "mixtral-8x7b-instruct-v01": "mistralai/mixtral-8x7b-instruct-v01",
         },
         "together-ai": {
             "llama-3-8b-instruct": "together_ai/meta-llama/Llama-3-8b-chat-hf",
             "llama-3-70b-instruct": "together_ai/meta-llama/Llama-3-70b-chat-hf",
+            "llama-3-1-8b-instruct": "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+            "llama-3-1-70b-instruct": "together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+            "llama-3-1-405b-instruct": "together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
             "llama-3-2-1b-instruct": "together_ai/togethercomputer/llama-3-2-1b-instruct",
+            "llama-3-3-70b-instruct": "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo"
         },
         "aws": {
             "llama-3-8b-instruct": "bedrock/meta.llama3-8b-instruct-v1:0",
@@ -3040,6 +3145,12 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
         "ollama": {
             "llama-3-8b-instruct": "llama3:8b",
             "llama-3-70b-instruct": "llama3:70b",
+            "llama-3-1-8b-instruct": "llama3.1:8b",
+            "llama-3-1-70b-instruct": "llama3.1:70b",
+            "llama-3-1-405b-instruct": "llama3.1:405b",
+            "llama-3-2-1b-instruct": "llama3.2:1b",
+            "llama-3-2-3b-instruct": "llama3.2:3b",
+            "llama-3-3-70b-instruct": "llama3.3"
         },
         "bam": {
             "granite-3-8b-instruct": "ibm/granite-8b-instruct-preview-4k",
@@ -3049,12 +3160,14 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
         },
         "rits": {
             "granite-3-8b-instruct": "ibm-granite/granite-3.0-8b-instruct",
+            "granite-3-2-8b-instruct": "ibm-granite/granite-3.2-8b-instruct",
             "llama-3-1-8b-instruct": "meta-llama/llama-3-1-8b-instruct",
             "llama-3-1-70b-instruct": "meta-llama/llama-3-1-70b-instruct",
+            "llama-3-1-405b-instruct": "meta-llama/llama-3-1-405b-instruct-fp8",
+            "llama-3-1-405b-instruct-fp8": "meta-llama/llama-3-1-405b-instruct-fp8",
             "llama-3-2-11b-vision-instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct",
             "llama-3-2-90b-vision-instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct",
             "llama-3-3-70b-instruct": "meta-llama/llama-3-3-70b-instruct",
-            "llama-3-1-405b-instruct-fp8": "meta-llama/llama-3-1-405b-instruct-fp8",
             "mistral-large-instruct": "mistralai/mistral-large-instruct-2407",
             "mixtral-8x7b-instruct": "mistralai/mixtral-8x7B-instruct-v0.1",
         },
@@ -3089,6 +3202,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
             "o1-preview": "azure/o1-preview",
             "gpt-4o-mini": "azure/gpt-4o-mini",
             "gpt-4o": "azure/gpt-4o",
+            "gpt-4o-2024-08-06": "azure/gpt-4o-2024-08-06",
             "gpt-4": "azure/gpt-4",
             "gpt-4-0314": "azure/gpt-4-0314",
             "gpt-4-0613": "azure/gpt-4-0613",
@@ -3133,6 +3247,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
             "mixtral-8x7b-instruct-v0.1": "replicate/mistralai/mixtral-8x7b-instruct-v0.1",
         },
     }
+    provider_model_map["watsonx"] = {k: f"watsonx/{v}" for k,v in provider_model_map["watsonx-sdk"].items()}
 
     _provider_to_base_class = {
         "watsonx": LiteLLMInferenceEngine,
@@ -3190,7 +3305,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
                 del args[param]
             else:
                 del args[param]
-        self.engine = cls(**args)
+        self.engine: InferenceEngine = cls(**args)
         self.data_classification_policy = self.engine.data_classification_policy
 
     def _infer(
@@ -3210,7 +3325,7 @@ class HFOptionSelectingInferenceEngine(InferenceEngine, TorchDeviceMixin):
 
     This class uses models from the HuggingFace Transformers library to calculate log probabilities for text inputs.
     """
-
+    label = "hf_option_selection"
     model_name: str
     batch_size: int
 
@@ -3218,6 +3333,9 @@ class HFOptionSelectingInferenceEngine(InferenceEngine, TorchDeviceMixin):
         "transformers": "Install huggingface package using 'pip install --upgrade transformers"
     }
 
+    def get_engine_id(self):
+        return get_model_and_label_id(self.model_name, self.label)
+
     def prepare_engine(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer
 
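The largest inference.py change adds a persistent, batched prediction cache to `InferenceEngine.infer()`: each instance is keyed by an md5 over its relevant fields plus the engine configuration, predictions are stored in a `diskcache.Cache`, and only cache misses are sent to `_infer()`. A simplified, self-contained sketch of the same pattern; the function names, field names, and cache directory are illustrative, not the library's actual API:

```python
import hashlib
import json
from itertools import islice

from diskcache import Cache

def batched(lst, n):
    # yield successive chunks of n items
    it = iter(lst)
    while batch := list(islice(it, n)):
        yield batch

cache = Cache("./inference_cache_demo")  # illustrative location

def cache_key(instance, engine_config):
    # key = instance fields that affect the prediction + the engine configuration
    record = {k: instance[k] for k in ("source", "task_data") if k in instance}
    record.update(engine_config)
    return hashlib.md5(json.dumps(record, sort_keys=True).encode()).hexdigest()

def infer_with_cache(instances, engine_config, infer_fn, batch_size=100):
    results = []
    for batch in batched(instances, batch_size):
        hits, misses = [], []
        for i, item in enumerate(batch):
            value = cache.get(cache_key(item, engine_config))
            if value is not None:
                hits.append((i, value))
            else:
                misses.append((i, item))
        # call the real engine only for the misses
        predictions = infer_fn([item for _, item in misses]) if misses else []
        for (i, item), prediction in zip(misses, predictions):
            cache[cache_key(item, engine_config)] = prediction
        # merge cached and fresh predictions back into the original order
        merged = sorted(hits + [(i, p) for (i, _), p in zip(misses, predictions)])
        results.extend(p for _, p in merged)
    return results
```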
llm_as_judge.py CHANGED
@@ -8,15 +8,12 @@ from .dict_utils import dict_get
8
  from .error_utils import UnitxtError
9
  from .inference import (
10
  InferenceEngine,
11
- OptionSelectingByLogProbsInferenceEngine,
12
  )
13
  from .llm_as_judge_chat_templates import direct_template_dict, pairwise_template_dict
14
  from .llm_as_judge_constants import (
15
  DIRECT_CRITERIA,
16
  EVALUATOR_TO_MODEL_ID,
17
  EVALUATORS_METADATA,
18
- INFERENCE_ENGINE_NAME_TO_CLASS,
19
- MODEL_RENAMINGS,
20
  PAIRWISE_CRITERIA,
21
  Criteria,
22
  CriteriaOption,
@@ -44,30 +41,50 @@ from .llm_as_judge_utils import (
44
  get_evaluator_metadata,
45
  get_parsed_context,
46
  rank_indexes,
47
- rename_model_if_required,
48
  )
49
  from .logging_utils import get_logger
50
  from .metrics import BulkInstanceMetric
51
  from .task import Task
52
  from .templates import Template
53
 
 
54
 
55
  class LLMJudge(BulkInstanceMetric):
 
 
 
 
 
56
  inference_engine: InferenceEngine
57
- # option_selection_strategy: OptionSelectionStrategyEnum = (
58
- # OptionSelectionStrategyEnum.PARSE_OUTPUT_TEXT
59
- # )
60
  evaluator_name: EvaluatorNameEnum = None
 
 
61
  check_positional_bias: bool = True
 
 
62
  context_fields: Union[str, List[str], Dict[str, str]] = ["context"]
63
- generate_summaries: bool = True
64
- format = "formats.chat_api"
65
- include_prompts_in_result: bool = False
 
 
 
 
 
 
 
 
66
  criteria_field: str = None
 
 
67
  criteria: Criteria = None
68
- logger = get_logger()
 
69
 
70
  def prepare(self):
 
71
  super().prepare()
72
  if isinstance(self.context_fields, str):
73
  self.context_fields = [self.context_fields]
@@ -78,10 +95,13 @@ class LLMJudge(BulkInstanceMetric):
78
 
79
  if self.evaluator_name is None:
80
  self.evaluator_name = self.inference_engine.get_engine_id()
81
- elif not isinstance(self.evaluator_name, EvaluatorNameEnum):
82
- self.evaluator_name = EvaluatorNameEnum[self.evaluator_name]
83
 
84
  def before_process_multi_stream(self):
 
 
 
 
 
85
  super().before_process_multi_stream()
86
  # We check the criteria here and not in verify(), because we want catalog
87
  # may contain a partially initialized object, and verify() method
@@ -93,6 +113,14 @@ class LLMJudge(BulkInstanceMetric):
93
  return
94
 
95
  def get_contexts(self, task_data: List[Dict[str, Any]]) -> List[Dict[str, str]]:
 
 
 
 
 
 
 
 
96
  return [
97
  get_parsed_context(
98
  {
@@ -110,6 +138,17 @@ class LLMJudge(BulkInstanceMetric):
110
  template: Template,
111
  previous_messages: Optional[List[Dict[str, str]]] = None,
112
  ):
 
 
 
 
 
 
 
 
 
 
 
113
  outputs_dataset = infer(
114
  instances,
115
  task=task,
@@ -129,6 +168,14 @@ class LLMJudge(BulkInstanceMetric):
129
  return (prompts, raw_predictions, predictions)
130
 
131
  def clean_results(self, results: Union[dict, list]):
 
 
 
 
 
 
 
 
132
  if isinstance(results, list):
133
  return [self.clean_results(x) for x in results]
134
  cleaned = {
@@ -143,13 +190,25 @@ class LLMJudge(BulkInstanceMetric):
143
  if not (isinstance(v, dict) and len(v) == 0)
144
  }
145
 
146
- def get_criterias(self, task_data, eval_count):
 
 
 
 
 
 
 
 
 
 
 
 
147
  if self.criteria is None:
148
  if self.criteria_field not in task_data[0]:
149
  raise UnitxtError(
150
  f"The criteria field `{self.criteria_field}` required for {__class__.__name__} is not found in instance. Perhaps you meant '{get_close_matches(self.criteria_field, task_data[0].keys(), n=1, cutoff=0.0)[0]}'?"
151
  )
152
- self.logger.info(
153
  f"Reading criteria from the task_data field '{self.criteria_field}'"
154
  )
155
  criterias = [
@@ -157,20 +216,31 @@ class LLMJudge(BulkInstanceMetric):
157
  for task_data_instance in task_data
158
  ]
159
  else:
160
- self.logger.info(
161
  "Reading criteria from self. Criteria is a single CriteriaWithOptions, replicating it for all predictions"
162
  )
163
  criterias: List[Criteria] = [self.criteria] * eval_count
164
  unique_criteria_names = list({criteria.name for criteria in criterias})
165
 
166
- self.logger.info(f"Criteria names are '{', '.join(unique_criteria_names)}'")
167
  return criterias
168
 
169
 
170
  class LLMJudgeDirect(LLMJudge):
 
 
 
 
 
 
 
 
171
  criteria: CriteriaWithOptions = None
 
172
  main_score = "llm_as_judge"
 
173
  reduction_map = {"mean": ["llm_as_judge"]}
 
174
 
175
  def prepare(self):
176
  super().prepare()
@@ -200,7 +270,7 @@ class LLMJudgeDirect(LLMJudge):
200
  self.option_selection_task = Task(
201
  input_fields={
202
  "criteria_description": str,
203
- "score_option_instruction": str,
204
  "options": list,
205
  },
206
  reference_fields={},
@@ -209,6 +279,7 @@ class LLMJudgeDirect(LLMJudge):
209
  )
210
 
211
  def before_process_multi_stream(self):
 
212
  super().before_process_multi_stream()
213
  if self.criteria is not None and not isinstance(
214
  self.criteria, CriteriaWithOptions
@@ -218,34 +289,42 @@ class LLMJudgeDirect(LLMJudge):
218
  )
219
  return
220
 
221
- def get_parsed_criteria(self, criteria: CriteriaWithOptions):
 
 
 
 
 
 
 
 
 
 
 
 
222
  criteria_description = criteria.description
223
  criteria_option_names = [o.name for o in criteria.options]
224
 
225
- display_options_instruction = "Choose an answer:\n" + "\n".join(
226
  [
227
  f'- "{o.name}"{f" if {o.description}" if o.description != "" else ""}'
228
  for o in criteria.options
229
  ]
230
  )
231
- score_option_instruction = "".join(
232
- [f"Score {o.name}: {o.description}\n" for o in criteria.options]
233
- )
234
 
235
  return (
236
  criteria_description,
237
  criteria_option_names,
238
  display_options_instruction,
239
- score_option_instruction,
240
  )
241
 
242
- def set_main_score(self, criterias: List[CriteriaWithOptions]):
243
  unique_criteria_names = list({criteria.name for criteria in criterias})
244
  if len(unique_criteria_names) == 1 and criterias[0].name != "":
245
  self.main_score = "_".join(criterias[0].name.lower().split(" "))
246
  self.reduction_map = {"mean": [self.main_score]}
247
 
248
- def get_results(
249
  self,
250
  assessment_prompts,
251
  assessment_outputs,
@@ -289,6 +368,9 @@ class LLMJudgeDirect(LLMJudge):
289
  "summary": summarization_outputs[i]
290
  if self.generate_summaries
291
  else None,
 
 
 
292
  "prompts": {
293
  "assessment": assessment_prompts[i],
294
  "positional_bias_assessment": assessment_prompts[
@@ -332,14 +414,113 @@ class LLMJudgeDirect(LLMJudge):
332
  references: List[List[str]],
333
  predictions: List[str],
334
  task_data: List[Dict[str, Any]],
335
- ) -> dict:
336
- self.logger.info(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  f'Starting evaluation with evaluator "{self.evaluator_name}" and provider "{self.inference_engine.get_pretty_print_name()}'
338
  )
339
  evaluations_count = len(predictions)
340
  # TODO: find out how to serialize and deserialize enums
341
- criterias = self.get_criterias(task_data, evaluations_count)
342
- self.set_main_score(criterias)
343
  contexts = self.get_contexts(task_data)
344
  if self.check_positional_bias:
345
  criterias += [
@@ -355,14 +536,13 @@ class LLMJudgeDirect(LLMJudge):
355
  predictions += predictions
356
 
357
  parsed_criterias = [
358
- self.get_parsed_criteria(criteria) for criteria in criterias
359
  ]
360
 
361
  (
362
  criteria_description_list,
363
  criteria_option_names_list,
364
  display_options_instruction_list,
365
- score_option_instruction_list,
366
  ) = zip(*parsed_criterias)
367
 
368
  assessment_for_summaries_slice = slice(0, evaluations_count)
@@ -385,7 +565,7 @@ class LLMJudgeDirect(LLMJudge):
385
  assessment_prompts, assessment_outputs, _ = self.perform_evaluation_step(
386
  assessment_instances, self.assessment_task, self.assessment_template
387
  )
388
- self.logger.info("The assessment was generated successfully.")
389
 
390
  summarization_prompts = None
391
  summarization_outputs = None
@@ -409,18 +589,22 @@ class LLMJudgeDirect(LLMJudge):
409
  self.summarization_task,
410
  self.summarization_template,
411
  )
412
- self.logger.info("The summary was generated successfully.")
413
 
414
  option_selection_instances = [
415
  {
416
  "criteria_description": criteria_description,
417
- "score_option_instruction": score_option_instruction,
418
  "options": criteria_option_names,
419
  "data_classification_policy": ["public"],
420
  }
421
- for criteria_description, score_option_instruction, criteria_option_names in zip(
 
 
 
 
422
  criteria_description_list,
423
- score_option_instruction_list,
424
  criteria_option_names_list,
425
  )
426
  ]
@@ -441,9 +625,9 @@ class LLMJudgeDirect(LLMJudge):
441
  self.option_selection_template,
442
  previous_messages,
443
  )
444
- self.logger.info("The selections were calculated successfully.")
445
 
446
- results = self.get_results(
447
  assessment_prompts,
448
  assessment_outputs,
449
  summarization_prompts,
@@ -454,15 +638,19 @@ class LLMJudgeDirect(LLMJudge):
454
  evaluations_count,
455
  criterias,
456
  )
 
457
  return self.clean_results(results)
458
 
459
 
460
  class LLMJudgePairwise(LLMJudge):
461
- reduction_map = {"mean": ["score"]}
462
  main_score = "1_winrate"
463
- prediction_type = List[str]
 
 
464
 
465
  def prepare(self):
 
466
  super().prepare()
467
  self.assessment_template = pairwise_template_dict["assessment"]
468
  self.summarization_template = pairwise_template_dict["summarization"]
@@ -501,6 +689,7 @@ class LLMJudgePairwise(LLMJudge):
501
  )
502
 
503
  def before_process_multi_stream(self):
 
504
  super().before_process_multi_stream()
505
  if self.criteria is not None and not isinstance(self.criteria, Criteria):
506
  raise Exception(
@@ -508,7 +697,7 @@ class LLMJudgePairwise(LLMJudge):
508
  )
509
  return
510
 
511
- def get_instance_results(
512
  self,
513
  instance_predictions: Dict[str, str],
514
  assessment_prompts,
@@ -520,8 +709,26 @@ class LLMJudgePairwise(LLMJudge):
520
  selections,
521
  contests_count,
522
  combination_indexes,
523
- criteria: Criteria,
524
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
  response_names = list(instance_predictions.keys())
526
  per_response_results = {
527
  response_key: {
@@ -680,32 +887,479 @@ class LLMJudgePairwise(LLMJudge):
680
  for metric in single_result.keys():
681
  all_results[f"{response_name}_{metric}"] = single_result[metric]
682
 
683
- all_results["criteria"] = criteria.to_json()
684
  return self.clean_results(all_results)
685
 
686
- def parse_prediction_to_dict(self, prediction: Union[Dict[str, str], List[str]]):
687
- if isinstance(prediction, list):
688
- return {f"{key + 1}": value for key, value in enumerate(prediction)}
689
-
690
- raise Exception(
691
- f"Prediction may be a list or a dict. Instead got type {type(prediction)}"
 
 
 
 
 
 
 
 
 
692
  )
693
 
694
- def convert_predictions_to_dicts(
695
  self, predictions: Union[List[Dict[str, str]], List[str]]
696
  ):
697
- return [self.parse_prediction_to_dict(prediction) for prediction in predictions]
 
 
 
 
 
 
 
 
 
 
 
698
 
699
  def compute(
700
  self,
701
  references: List[List[str]],
702
  predictions: List[str],
703
  task_data: List[Dict[str, str]],
704
- ) -> dict:
705
- self.logger.info(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
706
  f'Starting evaluation with evaluator "{self.evaluator_name}" and provider {self.inference_engine.get_pretty_print_name()}'
707
  )
708
- predictions = self.convert_predictions_to_dicts(predictions)
 
709
  instances_count = len(predictions)
710
  self.reduction_map = {"mean": ["score"]}
711
  self.reduction_map["mean"].extend(
@@ -721,7 +1375,7 @@ class LLMJudgePairwise(LLMJudge):
721
  len(combination_indexes) for combination_indexes in combination_indexes_list
722
  ]
723
 
724
- self.logger.info(
725
  f"The evaluation will perform {sum(contests_count_list) * [1, 2][self.check_positional_bias]} ({' + '.join([f'{c * [1, 2][self.check_positional_bias]}' for c in contests_count_list])}) pairwise comparisons"
726
  )
727
 
@@ -752,7 +1406,7 @@ class LLMJudgePairwise(LLMJudge):
752
  response_pairs_list.append(response_pairs)
753
  option_pairs_list.append(option_pairs)
754
 
755
- criterias = self.get_criterias(task_data, instances_count)
756
  contexts = self.get_contexts(task_data)
757
  if self.check_positional_bias:
758
  criterias.extend(criterias)
@@ -786,7 +1440,7 @@ class LLMJudgePairwise(LLMJudge):
786
  assessment_prompts, assessment_outputs, _ = self.perform_evaluation_step(
787
  assessment_instances, self.assessment_task, self.assessment_template
788
  )
789
- self.logger.info("The assessment was generated successfully.")
790
 
791
  # the slices used to get the assessment for each summary generation instance
792
  # it will grab the whole assessment for a particular instance or half of it depending on the value of check_positional_bias
@@ -836,7 +1490,7 @@ class LLMJudgePairwise(LLMJudge):
836
  self.summarization_task,
837
  self.summarization_template,
838
  )
839
- self.logger.info("The summary was generated successfully.")
840
 
841
  score_option_instruction_list = [
842
  "".join(
@@ -884,7 +1538,7 @@ class LLMJudgePairwise(LLMJudge):
884
  )
885
  # Selections are of the form 'Response n', so we just keep n
886
  selections = [selection.split(" ")[-1] for selection in selections]
887
- self.logger.info("The selections were calculated successfully.")
888
  results = []
889
  slice_start = 0
890
  for i, incremental_contests_count in enumerate(incremental_contests_count_list):
@@ -897,7 +1551,7 @@ class LLMJudgePairwise(LLMJudge):
897
  (incremental_contests_count_list[i - 1] if i > 0 else 0)
898
  + incremental_contests_count,
899
  )
900
- instance_results = self.get_instance_results(
901
  predictions[i],
902
  assessment_prompts[sli],
903
  assessment_outputs[sli],
 
8
  from .error_utils import UnitxtError
9
  from .inference import (
10
  InferenceEngine,
 
11
  )
12
  from .llm_as_judge_chat_templates import direct_template_dict, pairwise_template_dict
13
  from .llm_as_judge_constants import (
14
  DIRECT_CRITERIA,
15
  EVALUATOR_TO_MODEL_ID,
16
  EVALUATORS_METADATA,
 
 
17
  PAIRWISE_CRITERIA,
18
  Criteria,
19
  CriteriaOption,
 
41
  get_evaluator_metadata,
42
  get_parsed_context,
43
  rank_indexes,
 
44
  )
45
  from .logging_utils import get_logger
46
  from .metrics import BulkInstanceMetric
47
  from .task import Task
48
  from .templates import Template
49
 
50
+ logger = get_logger(__name__)
51
 
52
  class LLMJudge(BulkInstanceMetric):
53
+ """A metric class to evaluate instances using LLM as a Judge.
54
+
55
+ Evaluations are performed in two steps. First, the LLM is asked to generate an assessment following a CoT approach based on the criteria. Then, the same LLM is asked to select one of the available options. A summary of the general assessment can be generated for easy consumption by end users.
56
+ """
57
+
58
  inference_engine: InferenceEngine
59
+ """The engine used for generating predictions in the different evaluation steps."""
60
+
 
61
  evaluator_name: EvaluatorNameEnum = None
62
+ """The name of the evaluator. It is used for score naming. If not provided `self.inference_engine.get_engine_id()` is used."""
63
+
64
  check_positional_bias: bool = True
65
+ """Flag to check for positional bias. Detecting for positional bias duplicates the amount of inference calls."""
66
+
67
  context_fields: Union[str, List[str], Dict[str, str]] = ["context"]
68
+ """Fields to be used as context. If a dict is provided, the keys are used as the final names in the prompts, while the values are used to access the context variable values in the `task_data` object."""
69
+
70
+ generate_summaries: bool = False
71
+ """Flag to generate summaries of the assessments. Defaults to `False`."""
72
+
73
+ format: str = "formats.chat_api"
74
+ """The format used for the inference. Defaults to `formats.chat_api` (only allowed value)."""
75
+
76
+ include_prompts_in_result: bool = True
77
+ """Flag to include prompts in the result. Defaults to `True`."""
78
+
79
  criteria_field: str = None
80
+ """The field specifying the evaluation criteria in the `task_data` object."""
81
+
82
  criteria: Criteria = None
83
+ """The criteria used for evaluation. If the `criteria_field` is provided, it will take precedence."""
84
+
85
 
86
  def prepare(self):
87
+ """Prepares the `LLMJudge` instance by setting up context fields and evaluator name."""
88
  super().prepare()
89
  if isinstance(self.context_fields, str):
90
  self.context_fields = [self.context_fields]
 
95
 
96
  if self.evaluator_name is None:
97
  self.evaluator_name = self.inference_engine.get_engine_id()
 
 
98
 
99
  def before_process_multi_stream(self):
100
+ """Checks the criteria-related fields correctness before processing multiple streams.
101
+
102
+ Raises:
103
+ UnitxtError: If both 'criteria' and 'criteria_field' are not set.
104
+ """
105
  super().before_process_multi_stream()
106
  # We check the criteria here and not in verify(), because we want catalog
107
  # may contain a partially initialized object, and verify() method
 
113
  return
114
 
115
  def get_contexts(self, task_data: List[Dict[str, Any]]) -> List[Dict[str, str]]:
116
+ """Extracts and parses context fields from task data.
117
+
118
+ Args:
119
+ task_data (List[Dict[str, Any]]): The task data containing context information.
120
+
121
+ Returns:
122
+ List[Dict[str, str]]: A list of parsed context dictionaries.
123
+ """
124
  return [
125
  get_parsed_context(
126
  {
 
138
  template: Template,
139
  previous_messages: Optional[List[Dict[str, str]]] = None,
140
  ):
141
+ """Performs an evaluation step by generating predictions for the given instances.
142
+
143
+ Args:
144
+ instances (list): The list of instances to evaluate.
145
+ task (Task): The task associated with the instances.
146
+ template (Template): The template used for generating predictions.
147
+ previous_messages (Optional[List[Dict[str, str]]]): Previous messages for context.
148
+
149
+ Returns:
150
+ Tuple[List[str], List[str], List[str]]: A tuple containing prompts, raw predictions, and processed predictions. Raw predictions differ from processed predictions only in the option-selection step, where the `processors.match_closest_option` postprocessor is applied.
151
+ """
152
  outputs_dataset = infer(
153
  instances,
154
  task=task,
 
168
  return (prompts, raw_predictions, predictions)
169
 
170
  def clean_results(self, results: Union[dict, list]):
171
+ """Cleans the results by removing `None` values and empty lists and dictionaries.
172
+
173
+ Args:
174
+ results (Union[dict, list]): The results to clean.
175
+
176
+ Returns:
177
+ Union[dict, list]: The cleaned results.
178
+ """
179
  if isinstance(results, list):
180
  return [self.clean_results(x) for x in results]
181
  cleaned = {
 
190
  if not (isinstance(v, dict) and len(v) == 0)
191
  }
192
 
193
+ def get_criteria(self, task_data, eval_count):
194
+ """Retrieves the evaluation criteria from the `criteria_field` or from `self`.
195
+
196
+ Args:
197
+ task_data (List[Dict[str, Any]]): The task data containing criteria information.
198
+ eval_count (int): The number of evaluations to perform.
199
+
200
+ Returns:
201
+ List[Criteria]: A list of criteria for evaluation.
202
+
203
+ Raises:
204
+ UnitxtError: If the criteria field is not found in the task data.
205
+ """
206
  if self.criteria is None:
207
  if self.criteria_field not in task_data[0]:
208
  raise UnitxtError(
209
  f"The criteria field `{self.criteria_field}` required for {__class__.__name__} is not found in instance. Perhaps you meant '{get_close_matches(self.criteria_field, task_data[0].keys(), n=1, cutoff=0.0)[0]}'?"
210
  )
211
+ logger.info(
212
  f"Reading criteria from the task_data field '{self.criteria_field}'"
213
  )
214
  criterias = [
 
216
  for task_data_instance in task_data
217
  ]
218
  else:
219
+ logger.info(
220
  "Reading criteria from self. Criteria is a single CriteriaWithOptions, replicating it for all predictions"
221
  )
222
  criterias: List[Criteria] = [self.criteria] * eval_count
223
  unique_criteria_names = list({criteria.name for criteria in criterias})
224
 
225
+ logger.info(f"Criteria names are '{', '.join(unique_criteria_names)}'")
226
  return criterias
227
 
228
 
229
  class LLMJudgeDirect(LLMJudge):
230
+ """LLMJudgeDirect is a specialized evaluation metric that performs Direct Assessment using an LLM to score responses based on a predefined evaluation criteria.
231
+
232
+ Direct Assessment is an evaluation paradigm in which the LLM selects one of a
233
+ predefined set of options based on an assessment criterion. This approach can
234
+ be used for Likert-scale scoring (e.g., 1-5) or selecting from semantically
235
+ conditioned literals (e.g., Yes/No, Pass/Fail).
236
+ """
237
+
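+ To make the Direct Assessment inputs concrete, here is a hedged sketch of a Likert-style criteria expressed as plain data, mirroring the fields the judge reads below (a description, named options, and an `option_map` that turns the selected option into a numeric score); the criterion itself is hypothetical:
+
+ ```python
+ # Hypothetical criteria, shaped like CriteriaWithOptions (name, description, options, option_map).
+ answer_relevance = {
+     "name": "answer_relevance",
+     "description": "Does the response directly address the user's question?",
+     "options": [
+         {"name": "Excellent", "description": "fully addresses the question"},
+         {"name": "Could be Improved", "description": "partially addresses the question"},
+         {"name": "Bad", "description": "does not address the question"},
+     ],
+     "option_map": {"Excellent": 1.0, "Could be Improved": 0.5, "Bad": 0.0},
+ }
+ ```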
238
  criteria: CriteriaWithOptions = None
239
+ """The evaluation criteria, including a name, description, a predefined set of options and and option_map."""
240
  main_score = "llm_as_judge"
241
+ """The primary score name used in the results. By default, it will take the value of the criteria name (if only one criteria is being used for evaluation) or "llm_as_judge" otherwise."""
242
  reduction_map = {"mean": ["llm_as_judge"]}
243
+ """A mapping used for score aggregation. By default, it will take the value of `{'mean': [<default_main_score_name>]}`."""
244
 
245
  def prepare(self):
246
  super().prepare()
 
270
  self.option_selection_task = Task(
271
  input_fields={
272
  "criteria_description": str,
273
+ "display_options_instruction": str,
274
  "options": list,
275
  },
276
  reference_fields={},
 
279
  )
280
 
281
  def before_process_multi_stream(self):
282
+ """Ensures that the criteria is of type `CriteriaWithOptions`, raising an exception otherwise."""
283
  super().before_process_multi_stream()
284
  if self.criteria is not None and not isinstance(
285
  self.criteria, CriteriaWithOptions
 
289
  )
290
  return
291
 
292
+ def __get_parsed_criteria(self, criteria: CriteriaWithOptions):
293
+ """Extracts key information from the given criteria.
294
+
295
+ Args:
296
+ criteria (CriteriaWithOptions): The evaluation criteria.
297
+
298
+ Returns:
299
+ Tuple[str, List[str], str, str]:
300
+ - Criteria description.
301
+ - List of option names.
302
+ - Formatted instruction for displaying options.
303
+ - Instruction for scoring options.
304
+ """
305
  criteria_description = criteria.description
306
  criteria_option_names = [o.name for o in criteria.options]
307
 
308
+ display_options_instruction = "Choose an option:\n" + "\n".join(
309
  [
310
  f'- "{o.name}"{f" if {o.description}" if o.description != "" else ""}'
311
  for o in criteria.options
312
  ]
313
  )
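+ For the hypothetical criteria sketched earlier, the string built here would come out roughly as follows (a sketch, not captured output):
+
+ ```python
+ # Approximate rendering of display_options_instruction for the hypothetical three-option criteria.
+ expected = (
+     'Choose an option:\n'
+     '- "Excellent" if fully addresses the question\n'
+     '- "Could be Improved" if partially addresses the question\n'
+     '- "Bad" if does not address the question'
+ )
+ ```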
 
 
 
314
 
315
  return (
316
  criteria_description,
317
  criteria_option_names,
318
  display_options_instruction,
 
319
  )
320
 
321
+ def __set_main_score(self, criterias: List[CriteriaWithOptions]):
322
  unique_criteria_names = list({criteria.name for criteria in criterias})
323
  if len(unique_criteria_names) == 1 and criterias[0].name != "":
324
  self.main_score = "_".join(criterias[0].name.lower().split(" "))
325
  self.reduction_map = {"mean": [self.main_score]}
326
 
327
+ def __get_results(
328
  self,
329
  assessment_prompts,
330
  assessment_outputs,
 
368
  "summary": summarization_outputs[i]
369
  if self.generate_summaries
370
  else None,
371
+ "positional_bias_summary": summarization_outputs[i]
372
+ if self.generate_summaries and self.check_positional_bias
373
+ else None,
374
  "prompts": {
375
  "assessment": assessment_prompts[i],
376
  "positional_bias_assessment": assessment_prompts[
 
414
  references: List[List[str]],
415
  predictions: List[str],
416
  task_data: List[Dict[str, Any]],
417
+ ) -> List[Dict]:
418
+ r"""Performs direct assessment evaluation on the given predictions and references.
419
+
420
+ This method evaluates the quality of the predictions by calculating scores for each instance based on a criterion.
421
+
422
+ Returns:
423
+ -------
424
+ List[Dict]
425
+ A list of dictionaries containing the evaluation results for each instance. The results include the computed scores for each prediction. Each result will have the `score_name` as a prefix, which may be the criterion name if only one used, or "llm_as_judge" if several criteria were used.
426
+
427
+ Explanation of fields:
428
+
429
+ - `score`: a float representing the evaluation score for the response. The value is calculated from criteria.option_map[selected_option].
430
+ - `using_<evaluator_name>`: Equal to score.
431
+ - `positional_bias`: Boolean indicating whether the assessment detected positional bias. Its final value is selected_option != positional_bias_selected_option
432
+ - `selected_option`: The criteria option that the evaluator chose (e.g., "Could be Improved"). It is calculated by processing `option_selection_completion` using `processors.match_closest_option`
433
+ - `positional_bias_selected_option`: The criteria option that the evaluator chose when checking positional bias.
434
+ - `assessment`: The inference engine's generated text using the `prompts.assessment` prompt.
435
+ - `positional_bias_assessment`: The inference engine's generated text using the `prompts.positional_bias_assessment` prompt.
436
+ - `summary`: An LLM-generated summary of the assessment.
437
+ - `positional_bias_summary`: An LLM-generated summary of the positional bias assessment.
438
+ - `prompts`: A dictionary of prompts used in different stages of evaluation.
439
+ - `assessment`: The prompt used to instruct the model on how to assess the response.
440
+ - `positional_bias_assessment`: The prompt used to instruct the model on how to assess the response in the positional bias check.
441
+ - `summarization`: The prompt used to generate a summary of the assessment.
442
+ - `option_selection`: The prompt used to generate a final judgement.
443
+ - `positional_bias_option_selection`: The prompt used to generate a final judgement in the positional bias check.
444
+ - `option_selection_completion`: The inference engine's generated text using `prompts.option_selection`.
445
+ - `positional_bias_option_selection_completion`: The inference engine's generated text using `prompts.positional_bias_option_selection`.
446
+ - `criteria`: A JSON-like string representing the evaluation criteria's artifact.
447
+
448
+ Result example:
449
+
450
+ .. code-block:: python
451
+
452
+ [
453
+ {
454
+ "answer_relevance": 1,
455
+ "answer_relevance_using_granite3.0-2b_litellm": 1,
456
+ "answer_relevance_positional_bias": false,
457
+ "answer_relevance_selected_option": "Could be Improved",
458
+ "answer_relevance_positional_bias_selected_option": "Could be Improved",
459
+ "answer_relevance_assessment": "To assess the quality of the response, l...",
460
+ "answer_relevance_positional_bias_assessment": "To assess the quality of the response, l...",
461
+ "answer_relevance_summary": "A response about apprenticeships during ...",
462
+ "answer_relevance_positional_bias_summary": "A response about apprenticeships during ...",
463
+ "answer_relevance_prompts": {
464
+ "assessment": [
465
+ {
466
+ "role": "user",
467
+ "content": "You are presented with a response gener..."
468
+ }
469
+ ],
470
+ "positional_bias_assessment": [
471
+ {
472
+ "role": "user",
473
+ "content": "You are presented with a response gener..."
474
+ }
475
+ ],
476
+ "summarization": [
477
+ {
478
+ "role": "user",
479
+ "content": "Transform the following assessment into ..."
480
+ }
481
+ ],
482
+ "option_selection": [
483
+ {
484
+ "content": "You are presented with a response gener...",
485
+ "role": "user"
486
+ },
487
+ {
488
+ "content": "To assess the quality of the response, l...",
489
+ "role": "assistant"
490
+ },
491
+ {
492
+ "content": "Now consider the evaluation criteria and...",
493
+ "role": "user"
494
+ }
495
+ ],
496
+ "posional_bias_option_selection": [
497
+ {
498
+ "content": "You are presented with a response gener...",
499
+ "role": "user"
500
+ },
501
+ {
502
+ "content": "To assess the quality of the response, l...",
503
+ "role": "assistant"
504
+ },
505
+ {
506
+ "content": "Now consider the evaluation criteria and...",
507
+ "role": "user"
508
+ }
509
+ ]
510
+ },
511
+ "answer_relevance_option_selection_completion": "Could be Improved",
512
+ "answer_relevance_positional_bias_option_selection_completion": "Could be Improved",
513
+ "answer_relevance_criteria": "{ \"__type__\": \"criteria_with_options..."
514
+ }
515
+ ]
516
+ """
517
+ logger.info(
518
  f'Starting evaluation with evaluator "{self.evaluator_name}" and provider "{self.inference_engine.get_pretty_print_name()}"'
519
  )
520
  evaluations_count = len(predictions)
521
  # TODO: find out how to serialize and deserialize enums
522
+ criterias = self.get_criteria(task_data, evaluations_count)
523
+ self.__set_main_score(criterias)
524
  contexts = self.get_contexts(task_data)
525
  if self.check_positional_bias:
526
  criterias += [
 
536
  predictions += predictions
537
 
538
  parsed_criterias = [
539
+ self.__get_parsed_criteria(criteria) for criteria in criterias
540
  ]
541
 
542
  (
543
  criteria_description_list,
544
  criteria_option_names_list,
545
  display_options_instruction_list,
 
546
  ) = zip(*parsed_criterias)
547
 
548
  assessment_for_summaries_slice = slice(0, evaluations_count)
 
565
  assessment_prompts, assessment_outputs, _ = self.perform_evaluation_step(
566
  assessment_instances, self.assessment_task, self.assessment_template
567
  )
568
+ logger.info("The assessment was generated successfully.")
569
 
570
  summarization_prompts = None
571
  summarization_outputs = None
 
589
  self.summarization_task,
590
  self.summarization_template,
591
  )
592
+ logger.info("The summary was generated successfully.")
593
 
594
  option_selection_instances = [
595
  {
596
  "criteria_description": criteria_description,
597
+ "display_options_instruction": display_options_instruction,
598
  "options": criteria_option_names,
599
  "data_classification_policy": ["public"],
600
  }
601
+ for (
602
+ criteria_description,
603
+ display_options_instruction,
604
+ criteria_option_names
605
+ ) in zip(
606
  criteria_description_list,
607
+ display_options_instruction_list,
608
  criteria_option_names_list,
609
  )
610
  ]
 
625
  self.option_selection_template,
626
  previous_messages,
627
  )
628
+ logger.info("The selections were calculated successfully.")
629
 
630
+ results = self.__get_results(
631
  assessment_prompts,
632
  assessment_outputs,
633
  summarization_prompts,
 
638
  evaluations_count,
639
  criterias,
640
  )
641
+
642
  return self.clean_results(results)
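+ As a usage note, the per-instance dictionaries documented above can be consumed directly; a hedged sketch with hypothetical values mirroring the documented example:
+
+ ```python
+ # Hypothetical single result entry, mirroring the documented fields.
+ result = {
+     "answer_relevance": 1,
+     "answer_relevance_selected_option": "Could be Improved",
+     "answer_relevance_positional_bias": False,
+ }
+ score = result["answer_relevance"]                   # numeric score from criteria.option_map
+ option = result["answer_relevance_selected_option"]  # the option the judge picked
+ biased = result["answer_relevance_positional_bias"]  # True if reversing the order flipped the choice
+ print(score, option, biased)
+ ```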
643
 
644
 
645
  class LLMJudgePairwise(LLMJudge):
646
+ """A judge for pairwise comparison evaluations, where two or more responses are compared to determine which one is preferred based on a criterion."""
647
  main_score = "1_winrate"
648
+ """The main score metric for pairwise evaluation. By default, its value is `1_winrate`, and will take the value of the winrate of the first system."""
649
+ reduction_map = {"mean": ["score"]}
650
+ """A mapping specifying how scores should be reduced. By default, it will be `{'main': ['score']}`"""
651
 
652
  def prepare(self):
653
+ """Prepares the pairwise comparison by initializing the necessary templates and tasks. These tasks will be used to assess, summarize, and select options from candidate responses."""
654
  super().prepare()
655
  self.assessment_template = pairwise_template_dict["assessment"]
656
  self.summarization_template = pairwise_template_dict["summarization"]
 
689
  )
690
 
691
  def before_process_multi_stream(self):
692
+ """Verifies that the criteria is of the correct type before processing the multi-stream data."""
693
  super().before_process_multi_stream()
694
  if self.criteria is not None and not isinstance(self.criteria, Criteria):
695
  raise Exception(
 
697
  )
698
  return
699
 
700
+ def __get_instance_results(
701
  self,
702
  instance_predictions: Dict[str, str],
703
  assessment_prompts,
 
709
  selections,
710
  contests_count,
711
  combination_indexes,
712
+ criterion: Criteria,
713
  ):
714
+ """Computes the results for each instance by comparing the responses and calculating metrics such as winrate, ranking, and the responses overall performance. This method processes assessment, summarization, and option selection outputs to track contest results, positional bias, and winrate.
715
+
716
+ Args:
717
+ instance_predictions (Dict[str, str]): The predictions for each response.
718
+ assessment_prompts (List[str]): The prompts for the assessment task.
719
+ assessment_outputs (List[str]): The results from the assessment task.
720
+ summarization_prompts (List[str]): The prompts for the summarization task.
721
+ summarization_outputs (List[str]): The results from the summarization task.
722
+ option_selection_prompts (List[str]): The prompts for the option selection task.
723
+ option_selection_outputs (List[str]): The results from the option selection task.
724
+ selections (List[str]): The selections made during the pairwise comparison.
725
+ contests_count (int): The total number of contests that were run.
726
+ combination_indexes (List[Tuple[int, int]]): The indexes of the response pairs that were compared.
727
+ criterion (Criteria): The criterion used to assess the responses.
728
+
729
+ Returns:
730
+ dict: A dictionary containing the results for each response, including winrate, ranking, and other metrics.
731
+ """
732
  response_names = list(instance_predictions.keys())
733
  per_response_results = {
734
  response_key: {
 
887
  for metric in single_result.keys():
888
  all_results[f"{response_name}_{metric}"] = single_result[metric]
889
 
890
+ all_results["criteria"] = criterion.to_json()
891
  return self.clean_results(all_results)
892
 
893
+ def __parse_prediction_to_dict(self, predictions: Union[Dict[str, str], List[str]]):
894
+ """Converts a list or dictionary of predictions into a dictionary format.
895
+
896
+ Args:
897
+ predictions (Union[Dict[str, str], List[str]]): The prediction data to convert.
898
+
899
+ Returns:
900
+ dict: The prediction data in dictionary format.
901
+ """
902
+ if isinstance(predictions, list):
903
+ return {f"{key + 1}": value for key, value in enumerate(predictions)}
904
+ if isinstance(predictions, dict):
905
+ return predictions
906
+ raise UnitxtError(
907
+ f"Prediction may be a list or a dict. Instead got type {type(predictions)}"
908
  )
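+ The conversion is purely positional; an illustration with hypothetical responses:
+
+ ```python
+ # Hedged sketch: list predictions are keyed by their 1-based position, dicts pass through unchanged.
+ predictions = ["first system answer", "second system answer"]
+ as_dict = {f"{i + 1}": value for i, value in enumerate(predictions)}
+ assert as_dict == {"1": "first system answer", "2": "second system answer"}
+ ```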
909
 
910
+ def __convert_predictions_to_dicts(
911
  self, predictions: Union[List[Dict[str, str]], List[str]]
912
  ):
913
+ """Converts a list of predictions into a list of dictionaries.
914
+
915
+ Args:
916
+ predictions (Union[List[Dict[str, str]], List[str]]): The predictions to convert.
917
+
918
+ Returns:
919
+ List[dict]: A list of predictions in dictionary format.
920
+ """
921
+ return [self.__parse_prediction_to_dict(prediction) for prediction in predictions]
922
+
923
+ def __set_main_score(self, predictions: List[Dict[str, str]]):
924
+ self.main_score = f"{next(iter(predictions[0].keys()))}_winrate"
925
 
926
  def compute(
927
  self,
928
  references: List[List[str]],
929
  predictions: List[str],
930
  task_data: List[Dict[str, str]],
931
+ ) -> List[Dict]:
932
+ r"""Executes the pairwise comparison evaluation, including assessment, summarization, and option selection. It computes the winrate and ranking for the responses.
933
+
934
+ Args:
935
+ references (List[List[str]]): A list of reference responses for comparison.
936
+ predictions (List[str]): A list of predicted responses.
937
+ task_data (List[Dict[str, str]]): Task data to be used for evaluation.
938
+
939
+ Returns:
940
+ -------
941
+ List[Dict[str,Dict]]
942
+ The results of the evaluation, including winrate, ranking, and other metrics.
943
+
944
+ For each instance result, the following metrics are included per response/system. Each metric name is prefixed with the system's name (if predictions were provided as a list of dicts) or with its index, starting from 1 (if predictions were provided as a list of lists).
945
+
946
+ All of these fields are arrays of length `len(systems) - 1`. For any index `i`, `contest_results[i]` is the outcome of this response's contest against `compared_to[i]`.
947
+
948
+ Explanation of fields:
949
+
950
+ - `summaries`: A list of LLM-generated summaries explaining the comparison results for each response.
951
+ - `contest_results`: A list of boolean values indicating whether the response won in each comparison.
952
+ - `selections`: A list of the selected system names, representing the preferred response in each comparison.
953
+ - `compared_to`: A list of system names that were compared against the given response.
954
+ - `assessments`: A list of LLM-generated assessments explaining the reasoning behind the evaluation results.
955
+ - `positional_bias_assessments`: A list of LLM-generated assessments focused on detecting positional bias in the evaluation.
956
+ - `option_selection_outputs`: A list of response names selected as the best choice based on the evaluation.
957
+ - `positional_bias`: A list of boolean values indicating whether positional bias was detected in the contest.
958
+ - `positional_bias_selection`: A list of response names representing the selected option when considering positional bias.
959
+ - `prompts`: A dictionary of prompts used in different stages of evaluation.
960
+ - `assessment`: The prompt used to instruct the model on how to assess the responses.
961
+ - `positional_bias_assessment`: The prompt used to instruct the model on how to assess positional bias.
962
+ - `option_selection`: The prompt used to guide the model in selecting the best response.
963
+ - `positional_bias_option_selection`: The prompt used for selecting the best response while checking for positional bias.
964
+ - `summary`: The prompt used to generate a summary of the assessment.
965
+ - `winrate`: A float representing the proportion of comparisons the response won.
966
+ - `llm_as_judge`: Equal to `winrate`.
967
+ - `ranking`: An integer representing the ranking position of the response based on the evaluation results. Best is 1.
968
+ - `response_name`: A string identifying the response in the evaluation.
969
+
970
+ Result example:
971
+
972
+ .. code-block:: python
973
+
974
+ [
975
+ {
976
+ "system1_contest_results": [
977
+ true,
978
+ true
979
+ ],
980
+ "system1_selections": [
981
+ "system1",
982
+ "system1"
983
+ ],
984
+ "system1_compared_to": [
985
+ "system2",
986
+ "system3"
987
+ ],
988
+ "system1_assessments": [
989
+ "To determine the better response accordi...",
990
+ "To determine the better response accordi..."
991
+ ],
992
+ "system1_positional_bias_assessments": [
993
+ "To determine the better response accordi...",
994
+ "To determine the better response accordi..."
995
+ ],
996
+ "system1_option_selection_outputs": [
997
+ "system1",
998
+ "system1"
999
+ ],
1000
+ "system1_positional_bias": [
1001
+ false,
1002
+ false
1003
+ ],
1004
+ "system1_positional_bias_selection": [
1005
+ "system1",
1006
+ "system1"
1007
+ ],
1008
+ "system1_prompts": {
1009
+ "assessment": [
1010
+ [
1011
+ {
1012
+ "role": "user",
1013
+ "content": "You are provided a pair of responses (Re..."
1014
+ }
1015
+ ],
1016
+ [
1017
+ {
1018
+ "role": "user",
1019
+ "content": "You are provided a pair of responses (Re..."
1020
+ }
1021
+ ]
1022
+ ],
1023
+ "positional_bias_assessment": [
1024
+ [
1025
+ {
1026
+ "role": "user",
1027
+ "content": "You are provided a pair of responses (Re..."
1028
+ }
1029
+ ],
1030
+ [
1031
+ {
1032
+ "role": "user",
1033
+ "content": "You are provided a pair of responses (Re..."
1034
+ }
1035
+ ]
1036
+ ],
1037
+ "option_selection": [
1038
+ [
1039
+ {
1040
+ "content": "You are provided a pair of responses (Re...",
1041
+ "role": "user"
1042
+ },
1043
+ {
1044
+ "content": "To determine the better response accordi...",
1045
+ "role": "assistant"
1046
+ },
1047
+ {
1048
+ "content": "Now considering the evaluation criteria,...",
1049
+ "role": "user"
1050
+ }
1051
+ ],
1052
+ [
1053
+ {
1054
+ "content": "You are provided a pair of responses (Re...",
1055
+ "role": "user"
1056
+ },
1057
+ {
1058
+ "content": "To determine the better response accordi...",
1059
+ "role": "assistant"
1060
+ },
1061
+ {
1062
+ "content": "Now considering the evaluation criteria,...",
1063
+ "role": "user"
1064
+ }
1065
+ ]
1066
+ ],
1067
+ "positional_bias_option_selection": [
1068
+ [
1069
+ {
1070
+ "content": "You are provided a pair of responses (Re...",
1071
+ "role": "user"
1072
+ },
1073
+ {
1074
+ "content": "To determine the better response accordi...",
1075
+ "role": "assistant"
1076
+ },
1077
+ {
1078
+ "content": "Now considering the evaluation criteria,...",
1079
+ "role": "user"
1080
+ }
1081
+ ],
1082
+ [
1083
+ {
1084
+ "content": "You are provided a pair of responses (Re...",
1085
+ "role": "user"
1086
+ },
1087
+ {
1088
+ "content": "To determine the better response accordi...",
1089
+ "role": "assistant"
1090
+ },
1091
+ {
1092
+ "content": "Now considering the evaluation criteria,...",
1093
+ "role": "user"
1094
+ }
1095
+ ]
1096
+ ]
1097
+ },
1098
+ "system1_winrate": 1.0,
1099
+ "system1_llm_as_judge": 1.0,
1100
+ "system1_ranking": 1,
1101
+ "system1_response_name": "system1",
1102
+ "system2_contest_results": [
1103
+ false,
1104
+ true
1105
+ ],
1106
+ "system2_selections": [
1107
+ "system1",
1108
+ "system2"
1109
+ ],
1110
+ "system2_compared_to": [
1111
+ "system1",
1112
+ "system3"
1113
+ ],
1114
+ "system2_assessments": [
1115
+ "To determine the better response accordi...",
1116
+ "To determine the better response accordi..."
1117
+ ],
1118
+ "system2_positional_bias_assessments": [
1119
+ "To determine the better response accordi...",
1120
+ "To determine the better response accordi..."
1121
+ ],
1122
+ "system2_option_selection_outputs": [
1123
+ "system1",
1124
+ "system2"
1125
+ ],
1126
+ "system2_positional_bias": [
1127
+ false,
1128
+ false
1129
+ ],
1130
+ "system2_positional_bias_selection": [
1131
+ "system1",
1132
+ "system2"
1133
+ ],
1134
+ "system2_prompts": {
1135
+ "assessment": [
1136
+ [
1137
+ {
1138
+ "role": "user",
1139
+ "content": "You are provided a pair of responses (Re..."
1140
+ }
1141
+ ],
1142
+ [
1143
+ {
1144
+ "role": "user",
1145
+ "content": "You are provided a pair of responses (Re..."
1146
+ }
1147
+ ]
1148
+ ],
1149
+ "positional_bias_assessment": [
1150
+ [
1151
+ {
1152
+ "role": "user",
1153
+ "content": "You are provided a pair of responses (Re..."
1154
+ }
1155
+ ],
1156
+ [
1157
+ {
1158
+ "role": "user",
1159
+ "content": "You are provided a pair of responses (Re..."
1160
+ }
1161
+ ]
1162
+ ],
1163
+ "option_selection": [
1164
+ [
1165
+ {
1166
+ "content": "You are provided a pair of responses (Re...",
1167
+ "role": "user"
1168
+ },
1169
+ {
1170
+ "content": "To determine the better response accordi...",
1171
+ "role": "assistant"
1172
+ },
1173
+ {
1174
+ "content": "Now considering the evaluation criteria,...",
1175
+ "role": "user"
1176
+ }
1177
+ ],
1178
+ [
1179
+ {
1180
+ "content": "You are provided a pair of responses (Re...",
1181
+ "role": "user"
1182
+ },
1183
+ {
1184
+ "content": "To determine the better response accordi...",
1185
+ "role": "assistant"
1186
+ },
1187
+ {
1188
+ "content": "Now considering the evaluation criteria,...",
1189
+ "role": "user"
1190
+ }
1191
+ ]
1192
+ ],
1193
+ "positional_bias_option_selection": [
1194
+ [
1195
+ {
1196
+ "content": "You are provided a pair of responses (Re...",
1197
+ "role": "user"
1198
+ },
1199
+ {
1200
+ "content": "To determine the better response accordi...",
1201
+ "role": "assistant"
1202
+ },
1203
+ {
1204
+ "content": "Now considering the evaluation criteria,...",
1205
+ "role": "user"
1206
+ }
1207
+ ],
1208
+ [
1209
+ {
1210
+ "content": "You are provided a pair of responses (Re...",
1211
+ "role": "user"
1212
+ },
1213
+ {
1214
+ "content": "To determine the better response accordi...",
1215
+ "role": "assistant"
1216
+ },
1217
+ {
1218
+ "content": "Now considering the evaluation criteria,...",
1219
+ "role": "user"
1220
+ }
1221
+ ]
1222
+ ]
1223
+ },
1224
+ "system2_winrate": 0.5,
1225
+ "system2_llm_as_judge": 0.5,
1226
+ "system2_ranking": 2,
1227
+ "system2_response_name": "system2",
1228
+ "system3_contest_results": [
1229
+ false,
1230
+ false
1231
+ ],
1232
+ "system3_selections": [
1233
+ "system1",
1234
+ "system2"
1235
+ ],
1236
+ "system3_compared_to": [
1237
+ "system1",
1238
+ "system2"
1239
+ ],
1240
+ "system3_assessments": [
1241
+ "To determine the better response accordi...",
1242
+ "To determine the better response accordi..."
1243
+ ],
1244
+ "system3_positional_bias_assessments": [
1245
+ "To determine the better response accordi...",
1246
+ "To determine the better response accordi..."
1247
+ ],
1248
+ "system3_option_selection_outputs": [
1249
+ "system1",
1250
+ "system2"
1251
+ ],
1252
+ "system3_positional_bias": [
1253
+ false,
1254
+ false
1255
+ ],
1256
+ "system3_positional_bias_selection": [
1257
+ "system1",
1258
+ "system2"
1259
+ ],
1260
+ "system3_prompts": {
1261
+ "assessment": [
1262
+ [
1263
+ {
1264
+ "role": "user",
1265
+ "content": "You are provided a pair of responses (Re..."
1266
+ }
1267
+ ],
1268
+ [
1269
+ {
1270
+ "role": "user",
1271
+ "content": "You are provided a pair of responses (Re..."
1272
+ }
1273
+ ]
1274
+ ],
1275
+ "positional_bias_assessment": [
1276
+ [
1277
+ {
1278
+ "role": "user",
1279
+ "content": "You are provided a pair of responses (Re..."
1280
+ }
1281
+ ],
1282
+ [
1283
+ {
1284
+ "role": "user",
1285
+ "content": "You are provided a pair of responses (Re..."
1286
+ }
1287
+ ]
1288
+ ],
1289
+ "option_selection": [
1290
+ [
1291
+ {
1292
+ "content": "You are provided a pair of responses (Re...",
1293
+ "role": "user"
1294
+ },
1295
+ {
1296
+ "content": "To determine the better response accordi...",
1297
+ "role": "assistant"
1298
+ },
1299
+ {
1300
+ "content": "Now considering the evaluation criteria,...",
1301
+ "role": "user"
1302
+ }
1303
+ ],
1304
+ [
1305
+ {
1306
+ "content": "You are provided a pair of responses (Re...",
1307
+ "role": "user"
1308
+ },
1309
+ {
1310
+ "content": "To determine the better response accordi...",
1311
+ "role": "assistant"
1312
+ },
1313
+ {
1314
+ "content": "Now considering the evaluation criteria,...",
1315
+ "role": "user"
1316
+ }
1317
+ ]
1318
+ ],
1319
+ "positional_bias_option_selection": [
1320
+ [
1321
+ {
1322
+ "content": "You are provided a pair of responses (Re...",
1323
+ "role": "user"
1324
+ },
1325
+ {
1326
+ "content": "To determine the better response accordi...",
1327
+ "role": "assistant"
1328
+ },
1329
+ {
1330
+ "content": "Now considering the evaluation criteria,...",
1331
+ "role": "user"
1332
+ }
1333
+ ],
1334
+ [
1335
+ {
1336
+ "content": "You are provided a pair of responses (Re...",
1337
+ "role": "user"
1338
+ },
1339
+ {
1340
+ "content": "To determine the better response accordi...",
1341
+ "role": "assistant"
1342
+ },
1343
+ {
1344
+ "content": "Now considering the evaluation criteria,...",
1345
+ "role": "user"
1346
+ }
1347
+ ]
1348
+ ]
1349
+ },
1350
+ "system3_winrate": 0.0,
1351
+ "system3_llm_as_judge": 0.0,
1352
+ "system3_ranking": 3,
1353
+ "system3_response_name": "system3",
1354
+ "criteria": "{ \"__type__\": \"criteria\", \"name\"..."
1355
+ }
1356
+ ]
1357
+ """
1358
+ logger.info(
1359
  f'Starting evaluation with evaluator "{self.evaluator_name}" and provider {self.inference_engine.get_pretty_print_name()}'
1360
  )
1361
+ predictions = self.__convert_predictions_to_dicts(predictions)
1362
+ self.__set_main_score(predictions)
1363
  instances_count = len(predictions)
1364
  self.reduction_map = {"mean": ["score"]}
1365
  self.reduction_map["mean"].extend(
 
1375
  len(combination_indexes) for combination_indexes in combination_indexes_list
1376
  ]
1377
 
1378
+ logger.info(
1379
  f"The evaluation will perform {sum(contests_count_list) * [1, 2][self.check_positional_bias]} ({' + '.join([f'{c * [1, 2][self.check_positional_bias]}' for c in contests_count_list])}) pairwise comparisons"
1380
  )
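+ The count logged here follows directly from the pairing: each instance with `n` candidate responses yields `n * (n - 1) / 2` contests, doubled when positional bias is checked. A small arithmetic sketch (values hypothetical):
+
+ ```python
+ # Hedged sketch of the comparison count for one instance with three candidate responses.
+ from math import comb
+
+ n_systems = 3
+ check_positional_bias = True
+ contests = comb(n_systems, 2)                        # 3 pairwise contests
+ calls = contests * (2 if check_positional_bias else 1)
+ assert (contests, calls) == (3, 6)
+ ```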
1381
 
 
1406
  response_pairs_list.append(response_pairs)
1407
  option_pairs_list.append(option_pairs)
1408
 
1409
+ criterias = self.get_criteria(task_data, instances_count)
1410
  contexts = self.get_contexts(task_data)
1411
  if self.check_positional_bias:
1412
  criterias.extend(criterias)
 
1440
  assessment_prompts, assessment_outputs, _ = self.perform_evaluation_step(
1441
  assessment_instances, self.assessment_task, self.assessment_template
1442
  )
1443
+ logger.info("The assessment was generated successfully.")
1444
 
1445
  # the slices used to get the assessment for each summary generation instance
1446
  # it will grab the whole assessment for a particular instance or half of it depending on the value of check_positional_bias
 
1490
  self.summarization_task,
1491
  self.summarization_template,
1492
  )
1493
+ logger.info("The summary was generated successfully.")
1494
 
1495
  score_option_instruction_list = [
1496
  "".join(
 
1538
  )
1539
  # Selections are of the form 'Response n', so we just keep n
1540
  selections = [selection.split(" ")[-1] for selection in selections]
1541
+ logger.info("The selections were calculated successfully.")
1542
  results = []
1543
  slice_start = 0
1544
  for i, incremental_contests_count in enumerate(incremental_contests_count_list):
 
1551
  (incremental_contests_count_list[i - 1] if i > 0 else 0)
1552
  + incremental_contests_count,
1553
  )
1554
+ instance_results = self.__get_instance_results(
1555
  predictions[i],
1556
  assessment_prompts[sli],
1557
  assessment_outputs[sli],
llm_as_judge_chat_templates.py CHANGED
@@ -29,11 +29,13 @@ Assessment: {assessment}
29
  Summary:"""
30
  ),
31
  "answer": InputOutputTemplate(
32
- input_format="""Now consider the evaluation criteria and choose a final answer. Only include the chosen answer in the response.
 
33
  ###Evaluation criteria:
34
  {criteria_description}
35
- {score_option_instruction}
36
- The selected answer is: """,
 
37
  postprocessors=["processors.match_closest_option"],
38
  ),
39
  }
 
29
  Summary:"""
30
  ),
31
  "answer": InputOutputTemplate(
32
+ input_format="""Now based on the assessment, choose a criteria option. Only include the chosen option in the response. If the assessment already contains a selected option, choose that option. Don't contradict the assessment's selected option.
33
+
34
  ###Evaluation criteria:
35
  {criteria_description}
36
+ {display_options_instruction}
37
+
38
+ The selected criteria option is: """,
39
  postprocessors=["processors.match_closest_option"],
40
  ),
41
  }
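The `processors.match_closest_option` postprocessor referenced by this template snaps the raw completion onto one of the allowed option names. A hedged, standalone sketch of that idea (not the actual implementation), using difflib:

```python
# Illustrative only: snap a free-form completion to the closest allowed option name.
import difflib

options = ["Excellent", "Could be Improved", "Bad"]
completion = "could be improved."
closest = difflib.get_close_matches(completion.strip(". ").title(), options, n=1, cutoff=0.0)[0]
assert closest == "Could be Improved"
```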
llm_as_judge_constants.py CHANGED
@@ -3,10 +3,6 @@ from enum import Enum
3
  from typing import Dict, List, Optional
4
 
5
  from .artifact import Artifact
6
- from .inference import (
7
- LiteLLMInferenceEngine,
8
- RITSInferenceEngine,
9
- )
10
 
11
 
12
  class OptionSelectionStrategyEnum(str, Enum):
@@ -68,13 +64,13 @@ class EvaluatorTypeEnum(str, Enum):
68
 
69
  class EvaluatorNameEnum(str, Enum):
70
  MIXTRAL8_7b = "Mixtral8-7b"
71
- MIXTRAL8_22b = "Mixtral8-22b"
72
  MIXTRAL_LARGE = "Mixtral Large"
73
  LLAMA3_8B = "Llama3-8b"
74
  LLAMA3_1_405B = "Llama3.1-405b"
75
  LLAMA3_1_8B = "Llama3.1-8b"
76
  LLAMA3_1_70B = "Llama3.1-70b"
77
  LLAMA3_2_3B = "Llama3.2-3b"
 
78
  PROMETHEUS = "Prometheus"
79
  GPT4 = "GPT-4o"
80
  O1_PREVIEW = "o1-Preview"
@@ -84,53 +80,33 @@ class EvaluatorNameEnum(str, Enum):
84
  GRANITE3_8B = "Granite3.0-8b"
85
  GRANITE3_1_2B = "Granite3.1-2b"
86
  GRANITE3_1_8B = "Granite3.1-8b"
 
87
 
88
 
89
  class ModelProviderEnum(str, Enum):
90
  WATSONX = "watsonx"
91
  OPENAI = "openai"
92
  RITS = "rits"
93
- AZURE_OPENAI = "azure_openai"
94
 
95
 
96
  EVALUATOR_TO_MODEL_ID = {
97
- EvaluatorNameEnum.MIXTRAL8_7b: "mistralai/mixtral-8x7b-instruct-v01",
98
- EvaluatorNameEnum.MIXTRAL8_22b: "mistralai/mixtral-8x22B-instruct-v0.1",
99
- EvaluatorNameEnum.MIXTRAL_LARGE: "mistralai/mistral-large",
100
- EvaluatorNameEnum.LLAMA3_1_405B: "meta-llama/llama-3-405b-instruct",
101
- EvaluatorNameEnum.LLAMA3_1_8B: "meta-llama/llama-3-1-8b-instruct",
102
- EvaluatorNameEnum.LLAMA3_1_70B: "meta-llama/llama-3-1-70b-instruct",
103
- EvaluatorNameEnum.LLAMA3_2_3B: "meta-llama/llama-3-2-3b-instruct",
104
- EvaluatorNameEnum.PROMETHEUS: "kaist-ai/prometheus-8x7b-v2",
105
  EvaluatorNameEnum.GPT4: "gpt-4o-2024-08-06",
106
- EvaluatorNameEnum.O1_PREVIEW: "o1-preview-2024-09-12",
107
- EvaluatorNameEnum.O1_MINI: "o1-mini-2024-09-12",
108
- EvaluatorNameEnum.GRANITE_13B: "ibm/granite-13b-instruct-v2",
109
- EvaluatorNameEnum.GRANITE3_2B: "ibm/granite-3-2b-instruct",
110
- EvaluatorNameEnum.GRANITE3_8B: "ibm/granite-3-8b-instruct",
111
- EvaluatorNameEnum.GRANITE3_1_2B: "ibm/granite-3.1-2b-instruct",
112
- EvaluatorNameEnum.GRANITE3_1_8B: "ibm/granite-3.1-8b-instruct",
113
  }
114
 
115
- MODEL_RENAMINGS = {
116
- ModelProviderEnum.RITS: {
117
- "meta-llama/llama-3-1-8b-instruct": "meta-llama/Llama-3.1-8B-Instruct",
118
- "mistralai/mixtral-8x7b-instruct-v01": "mistralai/mixtral-8x7B-instruct-v0.1",
119
- "ibm/granite-3-8b-instruct": "ibm-granite/granite-3.0-8b-instruct",
120
- "ibm/granite-3.1-8b-instruct": "ibm-granite/granite-3.1-8b-instruct",
121
- "meta-llama/llama-3-405b-instruct": "meta-llama/llama-3-1-405b-instruct-fp8",
122
- "mistralai/mistral-large": "mistralai/mistral-large-instruct-2407",
123
- },
124
- }
125
-
126
- INFERENCE_ENGINE_NAME_TO_CLASS = {
127
- ModelProviderEnum.WATSONX: LiteLLMInferenceEngine,
128
- ModelProviderEnum.OPENAI: LiteLLMInferenceEngine,
129
- ModelProviderEnum.RITS: RITSInferenceEngine,
130
- ModelProviderEnum.AZURE_OPENAI: LiteLLMInferenceEngine,
131
- }
132
-
133
-
134
  class EvaluatorMetadata:
135
  name: EvaluatorNameEnum
136
  providers: List[ModelProviderEnum]
@@ -145,10 +121,6 @@ EVALUATORS_METADATA = [
145
  EvaluatorNameEnum.MIXTRAL8_7b,
146
  [ModelProviderEnum.RITS, ModelProviderEnum.WATSONX],
147
  ),
148
- EvaluatorMetadata(
149
- EvaluatorNameEnum.MIXTRAL8_22b,
150
- [ModelProviderEnum.RITS],
151
- ),
152
  EvaluatorMetadata(
153
  EvaluatorNameEnum.MIXTRAL_LARGE,
154
  [ModelProviderEnum.RITS, ModelProviderEnum.WATSONX],
@@ -161,6 +133,10 @@ EVALUATORS_METADATA = [
161
  EvaluatorNameEnum.GRANITE3_1_8B,
162
  [ModelProviderEnum.RITS],
163
  ),
 
 
 
 
164
  EvaluatorMetadata(
165
  EvaluatorNameEnum.GPT4,
166
  [ModelProviderEnum.OPENAI, ModelProviderEnum.AZURE_OPENAI],
@@ -185,6 +161,10 @@ EVALUATORS_METADATA = [
185
  EvaluatorNameEnum.LLAMA3_1_405B,
186
  [ModelProviderEnum.WATSONX, ModelProviderEnum.RITS],
187
  ),
 
 
 
 
188
  ]
189
 
190
  ################################ Direct Assessment Criterias ################################
 
3
  from typing import Dict, List, Optional
4
 
5
  from .artifact import Artifact
 
 
 
 
6
 
7
 
8
  class OptionSelectionStrategyEnum(str, Enum):
 
64
 
65
  class EvaluatorNameEnum(str, Enum):
66
  MIXTRAL8_7b = "Mixtral8-7b"
 
67
  MIXTRAL_LARGE = "Mixtral Large"
68
  LLAMA3_8B = "Llama3-8b"
69
  LLAMA3_1_405B = "Llama3.1-405b"
70
  LLAMA3_1_8B = "Llama3.1-8b"
71
  LLAMA3_1_70B = "Llama3.1-70b"
72
  LLAMA3_2_3B = "Llama3.2-3b"
73
+ LLAMA3_3_70B = "Llama3.3-70b"
74
  PROMETHEUS = "Prometheus"
75
  GPT4 = "GPT-4o"
76
  O1_PREVIEW = "o1-Preview"
 
80
  GRANITE3_8B = "Granite3.0-8b"
81
  GRANITE3_1_2B = "Granite3.1-2b"
82
  GRANITE3_1_8B = "Granite3.1-8b"
83
+ GRANITE3_2_8B = "Granite3.2-8b"
84
 
85
 
86
  class ModelProviderEnum(str, Enum):
87
  WATSONX = "watsonx"
88
  OPENAI = "openai"
89
  RITS = "rits"
90
+ AZURE_OPENAI = "azure"
91
 
92
 
93
  EVALUATOR_TO_MODEL_ID = {
94
+ EvaluatorNameEnum.MIXTRAL8_7b: "mixtral-8x7b-instruct-v01",
95
+ EvaluatorNameEnum.MIXTRAL_LARGE: "mistral-large-instruct",
96
+ EvaluatorNameEnum.LLAMA3_1_405B: "llama-3-1-405b-instruct",
97
+ EvaluatorNameEnum.LLAMA3_1_8B: "llama-3-1-8b-instruct",
98
+ EvaluatorNameEnum.LLAMA3_1_70B: "llama-3-1-70b-instruct",
99
+ EvaluatorNameEnum.LLAMA3_3_70B: "llama-3-3-70b-instruct",
 
 
100
  EvaluatorNameEnum.GPT4: "gpt-4o-2024-08-06",
101
+ EvaluatorNameEnum.O1_PREVIEW: "o1-preview",
102
+ EvaluatorNameEnum.O1_MINI: "o1-mini",
103
+ EvaluatorNameEnum.GRANITE3_2B: "granite-3-2b-instruct",
104
+ EvaluatorNameEnum.GRANITE3_8B: "granite-3-8b-instruct",
105
+ EvaluatorNameEnum.GRANITE3_1_2B: "granite-3-1-2b-instruct",
106
+ EvaluatorNameEnum.GRANITE3_1_8B: "granite-3-1-8b-instruct",
107
+ EvaluatorNameEnum.GRANITE3_2_8B: "granite-3-2-8b-instruct",
108
  }
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  class EvaluatorMetadata:
111
  name: EvaluatorNameEnum
112
  providers: List[ModelProviderEnum]
 
121
  EvaluatorNameEnum.MIXTRAL8_7b,
122
  [ModelProviderEnum.RITS, ModelProviderEnum.WATSONX],
123
  ),
 
 
 
 
124
  EvaluatorMetadata(
125
  EvaluatorNameEnum.MIXTRAL_LARGE,
126
  [ModelProviderEnum.RITS, ModelProviderEnum.WATSONX],
 
133
  EvaluatorNameEnum.GRANITE3_1_8B,
134
  [ModelProviderEnum.RITS],
135
  ),
136
+ EvaluatorMetadata(
137
+ EvaluatorNameEnum.GRANITE3_2_8B,
138
+ [ModelProviderEnum.WATSONX, ModelProviderEnum.RITS],
139
+ ),
140
  EvaluatorMetadata(
141
  EvaluatorNameEnum.GPT4,
142
  [ModelProviderEnum.OPENAI, ModelProviderEnum.AZURE_OPENAI],
 
161
  EvaluatorNameEnum.LLAMA3_1_405B,
162
  [ModelProviderEnum.WATSONX, ModelProviderEnum.RITS],
163
  ),
164
+ EvaluatorMetadata(
165
+ EvaluatorNameEnum.LLAMA3_3_70B,
166
+ [ModelProviderEnum.WATSONX, ModelProviderEnum.RITS],
167
+ ),
168
  ]
169
 
170
  ################################ Direct Assessment Criterias ################################
llm_as_judge_utils.py CHANGED
@@ -2,10 +2,8 @@ from typing import Dict
2
 
3
  from .llm_as_judge_constants import (
4
  EVALUATORS_METADATA,
5
- MODEL_RENAMINGS,
6
  EvaluatorMetadata,
7
  EvaluatorNameEnum,
8
- ModelProviderEnum,
9
  )
10
 
11
 
@@ -32,13 +30,6 @@ def get_evaluator_metadata(
32
  raise ValueError(f"An evaluator with id {name} matched several models.")
33
  return evaluator_search[0]
34
 
35
-
36
- def rename_model_if_required(model_name: str, provider: ModelProviderEnum) -> str:
37
- if provider in MODEL_RENAMINGS and model_name in MODEL_RENAMINGS[provider]:
38
- return MODEL_RENAMINGS[provider][model_name]
39
- return model_name
40
-
41
-
42
  def rank_indexes(numbers):
43
  # Generate the initial list of indices
44
  indices = list(range(len(numbers)))
 
2
 
3
  from .llm_as_judge_constants import (
4
  EVALUATORS_METADATA,
 
5
  EvaluatorMetadata,
6
  EvaluatorNameEnum,
 
7
  )
8
 
9
 
 
30
  raise ValueError(f"An evaluator with id {name} matched several models.")
31
  return evaluator_search[0]
32
 
 
 
 
 
 
 
 
33
  def rank_indexes(numbers):
34
  # Generate the initial list of indices
35
  indices = list(range(len(numbers)))
loaders.py CHANGED
@@ -67,7 +67,7 @@ from huggingface_hub import HfApi
67
  from tqdm import tqdm
68
 
69
  from .dataclass import NonPositionalField
70
- from .error_utils import UnitxtError, UnitxtWarning
71
  from .fusion import FixedFusion
72
  from .logging_utils import get_logger
73
  from .operator import SourceOperator
@@ -80,19 +80,27 @@ from .utils import LRUCache, recursive_copy
80
  logger = get_logger()
81
  settings = get_settings()
82
 
 
 
 
 
83
  def hf_load_dataset(path: str, *args, **kwargs):
84
  if settings.hf_offline_datasets_path is not None:
85
  path = os.path.join(settings.hf_offline_datasets_path, path)
86
- return _hf_load_dataset(
87
- path,
88
- *args, **kwargs,
89
- download_config=DownloadConfig(
90
- max_retries=settings.loaders_max_retries,
91
- ),
92
- verification_mode="no_checks",
93
- trust_remote_code=settings.allow_unverified_code,
94
- download_mode= "force_redownload" if settings.disable_hf_datasets_cache else "reuse_dataset_if_exists"
95
- )
 
 
 
 
96
 
97
  class Loader(SourceOperator):
98
  """A base class for all loaders.
@@ -288,26 +296,21 @@ class LoadHF(LazyLoader):
288
  if dataset is None:
289
  if streaming is None:
290
  streaming = self.is_streaming()
291
- try:
292
- dataset = hf_load_dataset(
293
- self.path,
294
- name=self.name,
295
- data_dir=self.data_dir,
296
- data_files=self.data_files,
297
- revision=self.revision,
298
- streaming=streaming,
299
- split=split,
300
- num_proc=self.num_proc,
301
- )
302
- except ValueError as e:
303
- if "trust_remote_code" in str(e):
304
- raise ValueError(
305
- f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE."
306
- ) from e
307
  self.__class__._loader_cache.max_size = settings.loader_cache_size
308
  if not disable_memory_caching:
309
  self.__class__._loader_cache[dataset_id] = dataset
310
- return self.__class__._loader_cache[dataset_id]
311
 
312
  def _maybe_set_classification_policy(self):
313
  if os.path.exists(self.path):
@@ -333,7 +336,9 @@ class LoadHF(LazyLoader):
333
  extract_on_the_fly=True,
334
  ),
335
  )
336
- except:
 
 
337
  UnitxtWarning(
338
  f'LoadHF(path="{self.path}", name="{self.name}") could not retrieve split names without loading the dataset. Consider defining "splits" in the LoadHF definition to improve loading time.'
339
  )
@@ -599,11 +604,11 @@ class LoadFromIBMCloud(Loader):
599
  load_ibm_cloud = LoadFromIBMCloud(
600
  endpoint_url_env='IBM_CLOUD_ENDPOINT',
601
  aws_access_key_id_env='IBM_AWS_ACCESS_KEY_ID',
602
- aws_secret_access_key_env='IBM_AWS_SECRET_ACCESS_KEY',
603
  bucket_name='my-bucket'
604
  )
605
  multi_stream = load_ibm_cloud.process()
606
- """ # pragma: allowlist secret
607
 
608
  endpoint_url_env: str
609
  aws_access_key_id_env: str
 
67
  from tqdm import tqdm
68
 
69
  from .dataclass import NonPositionalField
70
+ from .error_utils import Documentation, UnitxtError, UnitxtWarning
71
  from .fusion import FixedFusion
72
  from .logging_utils import get_logger
73
  from .operator import SourceOperator
 
80
  logger = get_logger()
81
  settings = get_settings()
82
 
83
+ class UnitxtUnverifiedCodeError(UnitxtError):
84
+ def __init__(self, path):
85
+ super().__init__(f"Loader cannot load and run remote code from {path} in huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE.", Documentation.SETTINGS)
86
+
87
  def hf_load_dataset(path: str, *args, **kwargs):
88
  if settings.hf_offline_datasets_path is not None:
89
  path = os.path.join(settings.hf_offline_datasets_path, path)
90
+ try:
91
+ return _hf_load_dataset(
92
+ path,
93
+ *args, **kwargs,
94
+ download_config=DownloadConfig(
95
+ max_retries=settings.loaders_max_retries,
96
+ ),
97
+ verification_mode="no_checks",
98
+ trust_remote_code=settings.allow_unverified_code,
99
+ download_mode= "force_redownload" if settings.disable_hf_datasets_cache else "reuse_dataset_if_exists"
100
+ )
101
+ except ValueError as e:
102
+ if "trust_remote_code" in str(e):
103
+ raise UnitxtUnverifiedCodeError(path) from e
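+ For reference, the two escape hatches named in this error message are set like so (a hedged sketch; the programmatic route assumes the same settings object this module imports via `get_settings`):
+
+ ```python
+ # Either set the environment variable before loading...
+ import os
+ os.environ["UNITXT_ALLOW_UNVERIFIED_CODE"] = "True"
+
+ # ...or flip the setting programmatically (import path assumed from this repo's settings_utils module):
+ # from unitxt.settings_utils import get_settings
+ # get_settings().allow_unverified_code = True
+ ```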
104
 
105
  class Loader(SourceOperator):
106
  """A base class for all loaders.
 
296
  if dataset is None:
297
  if streaming is None:
298
  streaming = self.is_streaming()
299
+
300
+ dataset = hf_load_dataset(
301
+ self.path,
302
+ name=self.name,
303
+ data_dir=self.data_dir,
304
+ data_files=self.data_files,
305
+ revision=self.revision,
306
+ streaming=streaming,
307
+ split=split,
308
+ num_proc=self.num_proc,
309
+ )
 
 
 
 
 
310
  self.__class__._loader_cache.max_size = settings.loader_cache_size
311
  if not disable_memory_caching:
312
  self.__class__._loader_cache[dataset_id] = dataset
313
+ return dataset
314
 
315
  def _maybe_set_classification_policy(self):
316
  if os.path.exists(self.path):
 
336
  extract_on_the_fly=True,
337
  ),
338
  )
339
+ except Exception as e:
340
+ if "trust_remote_code" in str(e):
341
+ raise UnitxtUnverifiedCodeError(self.path) from e
342
  UnitxtWarning(
343
  f'LoadHF(path="{self.path}", name="{self.name}") could not retrieve split names without loading the dataset. Consider defining "splits" in the LoadHF definition to improve loading time.'
344
  )
 
604
  load_ibm_cloud = LoadFromIBMCloud(
605
  endpoint_url_env='IBM_CLOUD_ENDPOINT',
606
  aws_access_key_id_env='IBM_AWS_ACCESS_KEY_ID',
607
+ aws_secret_access_key_env='IBM_AWS_SECRET_ACCESS_KEY', # pragma: allowlist secret
608
  bucket_name='my-bucket'
609
  )
610
  multi_stream = load_ibm_cloud.process()
611
+ """
612
 
613
  endpoint_url_env: str
614
  aws_access_key_id_env: str
metrics.py CHANGED
@@ -63,13 +63,10 @@ from .operator import (
63
  from .operators import ArtifactFetcherMixin, Copy, Set
64
  from .random_utils import get_seed
65
  from .settings_utils import get_settings
66
- from .sql_utils import get_db_connector
67
  from .stream import MultiStream, Stream
68
  from .type_utils import Type, isoftype, parse_type_string, to_type_string
69
  from .utils import deep_copy, recursive_copy
70
 
71
- FINQA_HASH = "42430b8613082bb4b85d49210284135d"
72
-
73
  logger = get_logger()
74
  settings = get_settings()
75
 
@@ -127,13 +124,18 @@ def nan_mean(x):
127
 
128
  def nan_max(x):
129
  with warnings.catch_warnings():
130
- # final mean should be mean of scores, ignoring NaN, hence nanmax
131
- # but if the group function values is NaN for ALL values, nanmean throws a
132
- # RuntimeWarning that it is calculating the mean of an empty slice (with no non-Nans)
133
- # this is the desired behavior, but we want to avoid the warning here
134
  warnings.simplefilter("ignore", category=RuntimeWarning)
135
  return np.nanmax(x)
136
 
 
 
 
 
 
 
 
 
 
137
 
138
  class UpdateStream(InstanceOperator):
139
  update: dict
@@ -365,6 +367,43 @@ def new_random_generator():
365
  return np.random.default_rng(hash(get_seed()) & _max_32bit)
366
 
367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  class ConfidenceIntervalMixin(Artifact):
369
  n_resamples: int = 1000
370
  confidence_level: float = 0.95
@@ -374,42 +413,41 @@ class ConfidenceIntervalMixin(Artifact):
374
  def _sample_to_scores(self, sample: List[Any]) -> Dict[str, Any]:
375
  pass
376
 
377
- def get_statistic(self, data: List[Any], score_names: List[str]):
378
- def statistic_function(indices, axis=0):
379
- # indices might be a 1D or 2D array, depending on bootstrap internals
380
- # For simplicity, ensure we handle them as 1D.
381
- indices = np.atleast_1d(indices).astype(int)
382
-
383
- # Gather the subset
384
- sample = [data[i] for i in indices]
385
-
386
- # Compute metrics on this sample
387
- scores = self._sample_to_scores(sample)
388
-
389
- # Return them in consistent order
390
- return np.array([scores[m] for m in score_names])
391
-
392
- return statistic_function
393
 
394
  def bootstrap(self, data: List[Any], score_names: List[str]):
395
  if self.ci_score_names is not None:
396
  score_names = self.ci_score_names
397
 
398
- intervals = bootstrap(
399
- (np.arange(len(data)),),
400
- statistic=self.get_statistic(data, score_names),
401
- n_resamples=self.n_resamples,
402
- confidence_level=self.confidence_level,
403
- random_state=new_random_generator(),
404
- paired=False,
405
- vectorized=False, # set to True if your statistic function is vectorized
406
- method="BCa",
407
- ).confidence_interval
 
 
 
 
 
 
 
 
 
 
408
 
409
  result = {}
410
  for i, metric in enumerate(score_names):
411
- result[f"{metric}_ci_low"] = float(intervals.low[i])
412
- result[f"{metric}_ci_high"] = float(intervals.high[i])
 
 
 
 
 
413
 
414
  return result
415
 
@@ -2769,7 +2807,7 @@ class FinQAEval(InstanceMetric):
2769
  remote_url = "https://raw.githubusercontent.com/czyssrs/FinQA/dfc5b72c01ee17c442d28d5201b82a1f4e95d5af/code/evaluate/evaluate.py"
2770
  local_filepath = "/tmp/finqa_eval_script.py"
2771
  module_name = "finqa_eval"
2772
- hash_of_script = FINQA_HASH
2773
 
2774
  download_finqa_eval_script_file(remote_url, local_filepath, hash_of_script)
2775
  self.finqa_module = load_finqa_eval_module_from_file(
@@ -3375,25 +3413,83 @@ class CustomF1(GlobalMetric):
3375
  result["precision_macro"] = self.zero_division
3376
 
3377
 
3378
- class NER(CustomF1):
3379
- """F1 Metrics that receives as input a list of (Entity,EntityType) pairs."""
3380
 
3381
- prediction_type = List[Tuple[str, str]]
 
 
 
 
 
 
3382
 
3383
- def get_element_group(self, element, additional_input):
3384
- return element[1]
 
 
 
 
 
3385
 
3386
- def get_element_representation(self, element, additional_input):
3387
- return str(element)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3388
 
 
3389
 
3390
- class KeyValueExtraction(CustomF1):
3391
- """F1 Metrics that receives as input a list of (Key,Value) pairs."""
3392
 
3393
  prediction_type = List[Tuple[str, str]]
3394
 
3395
  def get_element_group(self, element, additional_input):
3396
- return element[0]
3397
 
3398
  def get_element_representation(self, element, additional_input):
3399
  return str(element)
@@ -6004,6 +6100,9 @@ class GraniteGuardianBase(InstanceMetric):
6004
  )
6005
 
6006
  def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
 
 
 
6007
  self.verify_granite_guardian_config(task_data)
6008
  self.set_main_score()
6009
 
@@ -6017,7 +6116,10 @@ class GraniteGuardianBase(InstanceMetric):
6017
  )
6018
  messages = self.process_input_fields(task_data)
6019
  prompt = self.get_prompt(messages)
6020
- result = self.inference_engine.infer_log_probs([{"source": prompt}])
 
 
 
6021
  generated_tokens_list = result[0]
6022
  label, prob_of_risk = self.parse_output(generated_tokens_list)
6023
  confidence_score = (
@@ -6030,6 +6132,7 @@ class GraniteGuardianBase(InstanceMetric):
6030
  f"{self.main_score}_prob_of_risk": prob_of_risk,
6031
  f"{self.main_score}_certainty": confidence_score,
6032
  f"{self.main_score}_label": label,
 
6033
  }
6034
  logger.debug(f"Results are ready:\n{result}")
6035
  return result
@@ -6042,7 +6145,7 @@ class GraniteGuardianBase(InstanceMetric):
6042
  generated_tokens["top_tokens"] for generated_tokens in generated_tokens_list
6043
  ]
6044
  prob = self.get_probabilities(top_tokens_list)
6045
- prob_of_risk = prob[1]
6046
 
6047
  res = next(iter(generated_tokens_list))["text"].strip()
6048
 
@@ -6055,7 +6158,7 @@ class GraniteGuardianBase(InstanceMetric):
6055
 
6056
  return label, prob_of_risk
6057
 
6058
- def get_probabilities(self, top_tokens_list):
6059
  import torch
6060
 
6061
  safe_token_prob = 1e-50
@@ -6254,7 +6357,7 @@ class SQLExecutionAccuracy(InstanceMetric):
6254
  _requirements_list = ["sqlglot", "func_timeout"]
6255
 
6256
  @staticmethod
6257
- def compare_dfs_ignore_colnames(df1, df2):
6258
  """Compares two DataFrames based on row content, ignoring column names.
6259
 
6260
  Args:
@@ -6262,7 +6365,7 @@ class SQLExecutionAccuracy(InstanceMetric):
6262
  df2 (pd.DataFrame): Pandas DataFrame 2 to compare.
6263
 
6264
  Returns:
6265
- True if the DataFrames have the same content (ignoring column names),
6266
  False otherwise.
6267
  """
6268
  df1.fillna(0, inplace=True)
@@ -6276,6 +6379,20 @@ class SQLExecutionAccuracy(InstanceMetric):
6276
 
6277
  return df1_rows_sorted == df2_rows_sorted
6278
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6279
  @staticmethod
  def is_subset_ignore_colnames(df1, df2):
      """Checks if df1 is a subset of df2 based on row content, ignoring column names.

  from .operators import ArtifactFetcherMixin, Copy, Set
  from .random_utils import get_seed
  from .settings_utils import get_settings
  from .stream import MultiStream, Stream
  from .type_utils import Type, isoftype, parse_type_string, to_type_string
  from .utils import deep_copy, recursive_copy

  logger = get_logger()
  settings = get_settings()

  def nan_max(x):
      with warnings.catch_warnings():
          warnings.simplefilter("ignore", category=RuntimeWarning)
          return np.nanmax(x)

+ def nan_std(x):
+     with warnings.catch_warnings():
+         warnings.simplefilter("ignore", category=RuntimeWarning)
+         result = np.nanstd(x)
+         try:
+             return float(result)
+         except:
+             return result
+

  class UpdateStream(InstanceOperator):
      update: dict

      return np.random.default_rng(hash(get_seed()) & _max_32bit)


+ class Statistic:
+     """Statistic for which the confidence interval is to be calculated.
+
+     `statistic` must be a callable that accepts ``len(data)`` samples
+     as separate arguments and returns the resulting statistic.
+     If `vectorized` is set ``True``,
+     `statistic` must also accept a keyword argument `axis` and be
+     vectorized to compute the statistic along the provided `axis`.
+     """
+
+     def __init__(self, data, score_names, scorer):
+         self.data = data
+         self.score_names = score_names
+         self.scorer = scorer
+         self._history = []
+
+     def __call__(self, indices, axis=0):
+         # indices might be a 1D or 2D array, depending on bootstrap internals.
+         # For simplicity, ensure we handle them as 1D.
+         indices = np.atleast_1d(indices).astype(int)
+
+         # Gather the subset
+         sample = [self.data[i] for i in indices]
+
+         # Compute metrics on this sample
+         scores = self.scorer(sample)
+
+         # Return them in consistent order
+         result = np.array([scores[m] for m in self.score_names])
+         self._history.append(result)
+         return result
+
+     def mean(self, idx):
+         return nan_mean([result[idx] for result in self._history])
+
+     def std(self, idx):
+         return nan_std([result[idx] for result in self._history])
+
  class ConfidenceIntervalMixin(Artifact):
      n_resamples: int = 1000
      confidence_level: float = 0.95

      def _sample_to_scores(self, sample: List[Any]) -> Dict[str, Any]:
          pass

      def bootstrap(self, data: List[Any], score_names: List[str]):
          if self.ci_score_names is not None:
              score_names = self.ci_score_names

+         statistic = Statistic(data, score_names, self._sample_to_scores)
+         with warnings.catch_warnings():
+             warnings.filterwarnings(  # Ignore the error that arises when all sample scores are identical
+                 "ignore",
+                 message="invalid value encountered in divide",
+                 category=RuntimeWarning,
+             )
+
+             intervals = bootstrap(
+                 (np.arange(len(data)),),
+                 statistic=statistic,
+                 n_resamples=self.n_resamples,
+                 confidence_level=self.confidence_level,
+                 random_state=new_random_generator(),
+                 paired=False,
+                 vectorized=False,
+                 method="BCa",
+             ).confidence_interval

          result = {}
          for i, metric in enumerate(score_names):
+             high = intervals.high[i]
+             low = intervals.low[i]
+             if np.isnan(high) and np.isnan(low):
+                 if statistic.std(i) == 0:  # When all sample scores are identical, "BCa" fails (division by a std of 0)
+                     high = low = statistic.mean(i)  # In this case use the mean, as there is no variance
+             result[f"{metric}_ci_low"] = float(low)
+             result[f"{metric}_ci_high"] = float(high)

          return result
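The `Statistic` helper above exists so that `scipy.stats.bootstrap` can resample instance *indices* while an arbitrary scorer recomputes the requested scores on each resample; its recorded history backs the `mean`/`std` fallback used when BCa degenerates on identical scores. A minimal standalone sketch of the same index-resampling pattern, with a made-up accuracy scorer (not a unitxt API):

```python
import numpy as np
from scipy.stats import bootstrap

# Toy data: one dict per evaluated instance (a stand-in for unitxt instances).
data = [{"correct": c} for c in [1, 0, 1, 1, 0, 1, 1, 0, 1, 1]]

def scorer(sample):
    # Hypothetical metric recomputed on every bootstrap resample.
    return float(np.mean([d["correct"] for d in sample]))

def statistic(indices):
    # bootstrap() hands us a resample of the index array; map it back to instances.
    # (The Statistic class above does the same, but returns one value per score name.)
    return scorer([data[int(i)] for i in np.atleast_1d(indices)])

ci = bootstrap(
    (np.arange(len(data)),),  # resample indices, not the raw scores
    statistic=statistic,
    n_resamples=1000,
    confidence_level=0.95,
    vectorized=False,
    paired=False,
    method="BCa",
    random_state=np.random.default_rng(0),
).confidence_interval

print({"accuracy_ci_low": float(ci.low), "accuracy_ci_high": float(ci.high)})
```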
 
 
          remote_url = "https://raw.githubusercontent.com/czyssrs/FinQA/dfc5b72c01ee17c442d28d5201b82a1f4e95d5af/code/evaluate/evaluate.py"
          local_filepath = "/tmp/finqa_eval_script.py"
          module_name = "finqa_eval"
+         hash_of_script = "42430b8613082bb4b85d49210284135d"  # pragma: allowlist secret

          download_finqa_eval_script_file(remote_url, local_filepath, hash_of_script)
          self.finqa_module = load_finqa_eval_module_from_file(

              result["precision_macro"] = self.zero_division


+ class KeyValueExtraction(GlobalMetric):
+
+     prediction_type = Dict[str, str]
+     metric: Metric
+     single_reference_per_prediction = True
+     main_score = ""
+
+     def prepare(self):
+         super().prepare()
+         self.main_score = f"{self.metric.main_score}_micro"
+
+     def compute(
+         self,
+         references: List[List[Any]],
+         predictions: List[Any],
+         task_data: List[Dict],
+     ) -> dict:
+         references = [element[0] for element in references]
+
+         key_statistics = {}
+         all_reference_keys = set()
+         for reference in references:
+             all_reference_keys.update(list(reference.keys()))
+         for key in all_reference_keys:
+             key_statistics[key] = []
+
+         num_prediction_keys = 0
+         illegal_prediction_keys = 0
+         for reference, prediction in zip(references, predictions):
+             for key in all_reference_keys:
+                 if key not in reference and key not in prediction:
+                     continue
+                 if key in reference and key in prediction:
+                     multi_stream = MultiStream.from_iterables(
+                         {"test": [{"prediction": prediction[key], "references": [reference[key]]}]}
+                     )
+                     output_multi_stream = self.metric(multi_stream)
+                     output_stream = output_multi_stream["test"]
+                     score = next(iter(output_stream))["score"]["global"]["score"]
+                     key_statistics[key].append(score)
+                 else:
+                     key_statistics[key].append(0.0)
+
+             for key in prediction.keys():
+                 num_prediction_keys += 1
+                 if key not in all_reference_keys:
+                     illegal_prediction_keys += 1
+
+         result = {}
+
+         average = 0
+         total = 0
+
+         weighted_average = 0
+         for key in key_statistics:
+             mean_for_key = numpy.mean(key_statistics[key])
+             num = len(key_statistics[key])
+             total += num
+             average += mean_for_key
+             weighted_average += mean_for_key * num
+             result[f"{self.metric.main_score}_{key}"] = mean_for_key
+
+         result[f"{self.metric.main_score}_micro"] = weighted_average / total
+         result[f"{self.metric.main_score}_macro"] = average / len(key_statistics)
+         if num_prediction_keys != 0:
+             result[f"{self.metric.main_score}_legal_keys_in_predictions"] = 1 - 1.0 * illegal_prediction_keys / num_prediction_keys
+         else:
+             result[f"{self.metric.main_score}_legal_keys_in_predictions"] = 0
+
+         return result
+
+ class NER(CustomF1):
+     """F1 metric that receives as input a list of (Entity, EntityType) pairs."""

      prediction_type = List[Tuple[str, str]]

      def get_element_group(self, element, additional_input):
+         return element[1]

      def get_element_representation(self, element, additional_input):
          return str(element)
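To make `KeyValueExtraction`'s bookkeeping concrete, here is a small hand-worked sketch (plain Python, with exact string match standing in for the configurable inner metric) of how per-key scores roll up into the micro, macro, and legal-keys figures:

```python
import numpy as np

references = [{"name": "IBM", "year": "1911"}, {"name": "NASA"}]
predictions = [{"name": "IBM", "year": "1913"}, {"name": "NASA", "hq": "DC"}]

# Per-key score lists, collected over every instance where the key appears
# in the reference or the prediction (exact match as the inner metric).
key_scores = {"name": [1.0, 1.0], "year": [0.0]}

micro = sum(sum(v) for v in key_scores.values()) / sum(len(v) for v in key_scores.values())
macro = float(np.mean([np.mean(v) for v in key_scores.values()]))

# "hq" is not a reference key, so 1 of the 4 predicted keys is illegal.
legal_keys_in_predictions = 1 - 1 / 4

print(micro)                      # 2/3: weighted by how often each key is scored
print(macro)                      # (1.0 + 0.0) / 2 = 0.5
print(legal_keys_in_predictions)  # 0.75
```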
 
          )

      def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
+         # TODO replace with logic inside verify_granite_guardian_config and process_input_fields
+         task_data["prediction"] = prediction
+
          self.verify_granite_guardian_config(task_data)
          self.set_main_score()

          )
          messages = self.process_input_fields(task_data)
          prompt = self.get_prompt(messages)
+         data_classification_policy = task_data.get("metadata", {}).get("data_classification_policy")
+
+         result = self.inference_engine.infer_log_probs([{"source": prompt, "data_classification_policy": data_classification_policy}])
+
          generated_tokens_list = result[0]
          label, prob_of_risk = self.parse_output(generated_tokens_list)
          confidence_score = (

              f"{self.main_score}_prob_of_risk": prob_of_risk,
              f"{self.main_score}_certainty": confidence_score,
              f"{self.main_score}_label": label,
+             f"{self.main_score}_prompt": prompt,
          }
          logger.debug(f"Results are ready:\n{result}")
          return result

              generated_tokens["top_tokens"] for generated_tokens in generated_tokens_list
          ]
          prob = self.get_probabilities(top_tokens_list)
+         prob_of_risk = prob[1].item()

          res = next(iter(generated_tokens_list))["text"].strip()

          return label, prob_of_risk

+     def get_probabilities(self, top_tokens_list) -> Tuple[np.float32, np.float32]:
          import torch

          safe_token_prob = 1e-50

      _requirements_list = ["sqlglot", "func_timeout"]

      @staticmethod
+     def compare_dfs_ignore_colnames_ordered_rows(df1, df2):
          """Compares two DataFrames based on row content, ignoring column names.

          Args:
              df1 (pd.DataFrame): Pandas DataFrame 1 to compare.
              df2 (pd.DataFrame): Pandas DataFrame 2 to compare.

          Returns:
+             True if the DataFrames have the same ordered rows (ignoring column names),
              False otherwise.
          """
          df1.fillna(0, inplace=True)

          return df1_rows_sorted == df2_rows_sorted

+     @staticmethod
+     def compare_dfs_ignore_colnames_unordered_rows(df1, df2):
+         """Compares two DataFrames based on row content, ignoring row order and column names.
+
+         Args:
+             df1 (pd.DataFrame): Pandas DataFrame 1 to compare.
+             df2 (pd.DataFrame): Pandas DataFrame 2 to compare.
+
+         Returns:
+             True if the DataFrames have the same content (ignoring column names and row order),
+             False otherwise.
+         """
+         return set(map(tuple, df1.to_numpy())) == set(map(tuple, df2.to_numpy()))
+
      @staticmethod
      def is_subset_ignore_colnames(df1, df2):
          """Checks if df1 is a subset of df2 based on row content, ignoring column names.

@@ -6343,6 +6460,7 @@ class SQLExecutionAccuracy(InstanceMetric):
          import time

          from func_timeout import func_timeout
+         from func_timeout.exceptions import FunctionTimedOut

          from .sql_utils import sqlglot_optimized_equivalence

@@ -6358,6 +6476,9 @@ class SQLExecutionAccuracy(InstanceMetric):
              )
              end_time = time.perf_counter()
              gold_sql_runtime = end_time - start_time
+         except FunctionTimedOut as e:
+             pred_error = f"Timeout error executing gold SQL: {e}"
+             logger.warning(pred_error)
          except Exception as e:
              gold_error = f"Error executing gold SQL: {e}"
          if gold_error is not None:

@@ -6389,10 +6510,10 @@ class SQLExecutionAccuracy(InstanceMetric):
                  gold_sql_runtime,
                  0,
                  0,
-                 1,
+                 0,
                  0,
                  gold_df.to_json(),
-                 gold_df.to_json(),
+                 "",
                  "",
              )
          if predicted_sql.lower().strip() == gold_sql.lower().strip():

@@ -6417,6 +6538,9 @@ class SQLExecutionAccuracy(InstanceMetric):
              )
              end_time = time.perf_counter()
              pred_sql_runtime = end_time - start_time
+         except FunctionTimedOut as e:
+             pred_error = f"Timeout error executing predicted SQL: {e}"
+             logger.info(pred_error)
          except Exception as e:
              pred_error = f"Error executing predicted SQL: {e}"
              logger.info(pred_error)

@@ -6445,9 +6569,20 @@ class SQLExecutionAccuracy(InstanceMetric):
              pred_res = pred_res["results"]
          predicted_df = pd.DataFrame(pred_res)

-         execution_result = (
-             1 if self.compare_dfs_ignore_colnames(predicted_df, gold_df) else 0
-         )
+         if "ORDER BY" in gold_sql.upper():
+             execution_result = (
+                 1
+                 if self.compare_dfs_ignore_colnames_ordered_rows(predicted_df, gold_df)
+                 else 0
+             )
+         else:
+             execution_result = (
+                 1
+                 if self.compare_dfs_ignore_colnames_unordered_rows(
+                     predicted_df, gold_df
+                 )
+                 else 0
+             )

          subset_non_empty_execution_result = 0
          non_empty_execution_result = 0

@@ -6473,6 +6608,8 @@ class SQLExecutionAccuracy(InstanceMetric):
          )

      def compute(self, references: List[Any], prediction: str, task_data: Dict) -> dict:
+         from .sql_utils import get_db_connector
+
          predicted_sql = prediction
          execution_result: float = 0.0
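The `ORDER BY` branch makes execution accuracy order-sensitive only when the gold query actually constrains row order. A simplified sketch of the two comparison modes on toy DataFrames (the committed helpers additionally fill NaNs and normalize each row; column names differ on purpose):

```python
import pandas as pd

gold_df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
pred_df = pd.DataFrame({"c1": [3, 1, 2], "c2": ["z", "x", "y"]})  # same rows, different order

def same_rows_ordered(df1, df2):
    # Simplified stand-in for compare_dfs_ignore_colnames_ordered_rows.
    return df1.to_numpy().tolist() == df2.to_numpy().tolist()

def same_rows_unordered(df1, df2):
    # Mirrors compare_dfs_ignore_colnames_unordered_rows.
    return set(map(tuple, df1.to_numpy())) == set(map(tuple, df2.to_numpy()))

print(same_rows_ordered(pred_df, gold_df))    # False -> wrong for ORDER BY queries
print(same_rows_unordered(pred_df, gold_df))  # True  -> correct otherwise
```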
 
schema.py CHANGED
@@ -67,8 +67,7 @@ def load_chat_source(chat_str):
          )
      return chat

-
- def loads_instance(batch):
+ def loads_batch(batch):
      if (
          "source" in batch
          and isinstance(batch["source"][0], str)
@@ -86,6 +85,24 @@ def loads_instance(batch):
          batch["task_data"] = [json.loads(d) for d in batch["task_data"]]
      return batch

+ def loads_instance(instance):
+     if (
+         "source" in instance
+         and isinstance(instance["source"], str)
+         and (
+             instance["source"].startswith('[{"role":')
+             or instance["source"].startswith('[{"content":')
+         )
+     ):
+         instance["source"] = load_chat_source(instance["source"])
+     if (
+         not settings.task_data_as_text
+         and "task_data" in instance
+         and isinstance(instance["task_data"], str)
+     ):
+         instance["task_data"] = json.loads(instance["task_data"])
+     return instance
+

  class FinalizeDataset(InstanceOperatorValidator):
      group_by: List[List[str]]
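The split keeps two decoders: `loads_batch` for the column-oriented dicts that `datasets` hands to batched transforms, and `loads_instance` for a single example. A rough, hedged sketch of the shapes involved and of the `task_data` decoding both perform (field values are made up):

```python
import json

# Batched form: every field maps to a list, one entry per example.
batch = {
    "source": ['[{"role": "user", "content": "What is 2+2?"}]'],
    "task_data": ['{"answer": "4"}'],
}

# Per-instance form: the same fields hold single values.
instance = {
    "source": '[{"role": "user", "content": "What is 2+2?"}]',
    "task_data": '{"answer": "4"}',
}

# Roughly what the two helpers do with task_data (ignoring the settings flag
# and the chat-source parsing shown in the diff above):
decoded_batch = {**batch, "task_data": [json.loads(d) for d in batch["task_data"]]}
decoded_instance = {**instance, "task_data": json.loads(instance["task_data"])}

print(decoded_batch["task_data"][0]["answer"])  # 4
print(decoded_instance["task_data"]["answer"])  # 4
```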
serializers.py CHANGED
@@ -7,7 +7,6 @@ from typing import Any, Dict, List, Union
  from .dataclass import AbstractField, Field
  from .operators import InstanceFieldOperator
  from .settings_utils import get_constants
- from .sql_utils import get_db_connector
  from .type_utils import isoftype, to_type_string
  from .types import (
      Dialog,
@@ -203,5 +202,7 @@ class SQLDatabaseAsSchemaSerializer(SingleTypeSerializer):
      serialized_type = SQLDatabase

      def serialize(self, value: SQLDatabase, instance: Dict[str, Any]) -> str:
+         from .sql_utils import get_db_connector
+
          connector = get_db_connector(value["db_type"])(value)
          return connector.get_table_schema()
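Moving the `get_db_connector` import into `serialize` means `serializers.py` no longer pulls in `sql_utils` at module load time; the dependency is only resolved when a SQL schema is actually serialized, which is a common way to sidestep import cycles and optional dependencies. A generic sketch of the pattern (names below are illustrative, not unitxt APIs):

```python
def serialize_schema(value: dict) -> str:
    # Deferred import: resolved on first call instead of at module import time,
    # so importing this module cannot drag in (or cycle with) the helper module.
    from json import dumps  # stand-in for `from .sql_utils import get_db_connector`

    return dumps(value, indent=2)

print(serialize_schema({"db_type": "sqlite", "tables": ["users"]}))
```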
settings_utils.py CHANGED
@@ -159,6 +159,7 @@ if Settings.is_uninitilized():
      settings.hf_offline_datasets_path = None
      settings.hf_offline_metrics_path = None
      settings.hf_offline_models_path = None
+     settings.inference_engine_cache_path = "./inference_engine_cache/"

  if Constants.is_uninitilized():
      constants = Constants()
sql_utils.py CHANGED
@@ -1,4 +1,7 @@
+ import functools
  import glob
+ import hashlib
+ import json
  import os
  import re
  import sqlite3
@@ -16,6 +19,14 @@ from .types import SQLDatabase

  logger = get_logger()

+ # Check if caching is enabled via environment variable
+ CACHE_LOCATION = os.getenv("UNITXT_CACHE_LOCATION")
+
+ # Set max cache size to 10GB or the value of env var MAX_CACHE_SIZE
+ MAX_CACHE_SIZE = os.getenv("MAX_CACHE_SIZE", 10 * 1024**3)
+
+ _cache_instance = None
+

  class DatabaseConnector(ABC):
      """Abstract base class for database connectors."""
@@ -23,7 +34,7 @@ class DatabaseConnector(ABC):
      def __init__(self, db_config: SQLDatabase):
          self.db_config = db_config
          self.databases_folder = os.path.join(
-             os.environ.get("UNITXT_TEXT2SQL_CACHE", "cache/text2sql"), "databases"
+             os.environ.get("UNITXT_CACHE_LOCATION", "cache/text2sql"), "databases"
          )
          os.makedirs(self.databases_folder, exist_ok=True)

@@ -187,6 +198,177 @@ class InMemoryDatabaseConnector(DatabaseConnector):
              conn.close()


+ def get_cache():
+     """Returns a singleton cache instance, initializing it if necessary."""
+     global _cache_instance
+     if _cache_instance is None:
+         _cache_instance = Cache()
+     return _cache_instance
+
+
+ def generate_cache_key(*args, **kwargs):
+     """Generate a stable hashable cache key for various input types.
+
+     :param args: Positional arguments of the function.
+     :param kwargs: Keyword arguments of the function.
+     :return: A hashed key as a string.
+     """
+     try:
+         # Convert args and kwargs to a JSON string (sorted to ensure consistency)
+         serialized = json.dumps(
+             {"args": args, "kwargs": kwargs}, sort_keys=True, default=str
+         )
+     except TypeError:
+         # Fallback for non-serializable objects
+         serialized = repr((args, kwargs))
+
+     # Hash the serialized data
+     return hashlib.md5(serialized.encode()).hexdigest()
+
+
+ class Cache:
+     """A class that provides disk-based caching functionality for a given function."""
+
+     def __init__(self):
+         """Initializes the cache.
+
+         If `CACHE_LOCATION` (os.getenv("UNITXT_CACHE_LOCATION")) is set, a disk-based
+         cache is created using `diskcache`.
+         """
+         if CACHE_LOCATION:
+             try:
+                 import diskcache
+
+                 # Ensure the cache directory exists
+                 os.makedirs(CACHE_LOCATION, exist_ok=True)
+
+                 # Create a global diskcache Cache instance
+                 self.cache = diskcache.Cache(CACHE_LOCATION, size_limit=MAX_CACHE_SIZE)
+                 logger.info(f"Caching enabled at {CACHE_LOCATION}")
+             except ImportError as e:
+                 raise ImportError(
+                     "UNITXT_CACHE_LOCATION is set, but diskcache is not installed.\n"
+                     "Please install diskcache `pip install diskcache` "
+                     "or unset UNITXT_CACHE_LOCATION."
+                 ) from e
+         else:
+             self.cache = None  # Disable caching
+
+     def get_or_set(self, key, compute_fn, no_cache=False, refresh=False):
+         if not self.cache or no_cache:
+             logger.info(f"Bypassing cache for key: {key}")
+             return compute_fn()
+
+         if refresh and key in self.cache:
+             logger.info(f"Refreshing cache for key: {key}")
+             del self.cache[key]
+
+         if key in self.cache:
+             logger.info(f"Cache hit for key: {key}")
+             return self.cache[key]
+
+         logger.info(f"Cache miss for key: {key}. Computing value...")
+         result = compute_fn()
+         self.cache[key] = result
+         logger.info(f"Stored result in cache for key: {key}")
+         return result
+
+     async def async_get_or_set(self, key, compute_fn, no_cache=False, refresh=False):
+         if not self.cache or no_cache:
+             logger.info(f"Bypassing cache for key: {key}")
+             return await compute_fn()
+
+         if refresh and key in self.cache:
+             logger.info(f"Refreshing cache for key: {key}")
+             del self.cache[key]
+
+         if key in self.cache:
+             logger.info(f"Cache hit for key: {key}")
+             return self.cache[key]
+
+         logger.info(f"Cache miss for key: {key}. Computing value asynchronously...")
+         result = await compute_fn()
+         self.cache[key] = result
+         logger.info(f"Stored result in cache for key: {key}")
+         return result
+
+     def memoize(self, key_func=generate_cache_key, no_cache=False, refresh=False):
+         def decorator(func):
+             @functools.wraps(func)
+             def wrapper(*args, **kwargs):
+                 if not self.cache or no_cache:
+                     logger.info(f"Bypassing cache for function: {func.__name__}")
+                     return func(*args, **kwargs)
+
+                 key = key_func(func.__name__, *args, **kwargs)
+
+                 if refresh and key in self.cache:
+                     logger.info(
+                         f"Refreshing cache for function: {func.__name__}, key: {key}"
+                     )
+                     del self.cache[key]
+
+                 if key in self.cache:
+                     logger.info(f"Cache hit for function: {func.__name__}, key: {key}")
+                     return self.cache[key]
+
+                 logger.info(
+                     f"Cache miss for function: {func.__name__}, key: {key}. Computing value..."
+                 )
+                 result = func(*args, **kwargs)
+                 self.cache[key] = result
+                 logger.info(
+                     f"Stored result in cache for function: {func.__name__}, key: {key}"
+                 )
+                 return result
+
+             return wrapper
+
+         return decorator
+
+     def async_memoize(self, key_func=generate_cache_key, no_cache=False, refresh=False):
+         def decorator(func):
+             @functools.wraps(func)
+             async def wrapper(*args, **kwargs):
+                 if no_cache:
+                     logger.info(f"Bypassing cache for async function: {func.__name__}")
+                     return await func(*args, **kwargs)
+
+                 key = key_func(func.__name__, *args, **kwargs)
+
+                 if refresh and key in self.cache:
+                     logger.info(
+                         f"Refreshing cache for async function: {func.__name__}, key: {key}"
+                     )
+                     del self.cache[key]
+
+                 if key in self.cache:
+                     logger.info(
+                         f"Cache hit for async function: {func.__name__}, key: {key}"
+                     )
+                     return self.cache[key]
+
+                 logger.info(
+                     f"Cache miss for async function: {func.__name__}, key: {key}. Computing value..."
+                 )
+                 result = await func(*args, **kwargs)
+                 self.cache[key] = result
+                 logger.info(
+                     f"Stored result in cache for async function: {func.__name__}, key: {key}"
+                 )
+                 return result
+
+             return wrapper
+
+         return decorator
+
+
  @lru_cache(maxsize=128)
  def execute_query_remote(
      api_url: str,
@@ -318,12 +500,20 @@ class RemoteDatabaseConnector(DatabaseConnector):

      def execute_query(self, query: str) -> Any:
          """Executes a query against the remote database, with retries for certain exceptions."""
-         return execute_query_remote(
-             api_url=self.api_url,
-             database_id=self.database_id,
-             api_key=self.api_key,
-             query=query,
-             timeout=self.timeout,
+         cache = get_cache()
+
+         cache_key = generate_cache_key(
+             "sql_request", self.api_url, self.database_id, query
+         )
+         return cache.get_or_set(
+             cache_key,
+             lambda: execute_query_remote(
+                 api_url=self.api_url,
+                 database_id=self.database_id,
+                 api_key=self.api_key,
+                 query=query,
+                 timeout=self.timeout,
+             ),
          )
519
 
struct_data_operators.py CHANGED
@@ -1024,24 +1024,24 @@ class ShuffleColumnsNames(TypeDependentAugmentor):
          return {"header": shuffled_header, "rows": table["rows"]}


- class JsonStrToListOfKeyValuePairs(FieldOperator):
-     """Convert a Json string of representing key value as dictionary to list of key value pairs."""
+ class JsonStrToDict(FieldOperator):
+     """Convert a JSON string representing key/value pairs into a dictionary.
+
+     Ensure keys and values are strings, and there are no None values.
+
+     """

      def process_value(self, text: str) -> List[Tuple[str, str]]:
          try:
              dict_value = json.loads(text)
          except Exception as e:
              UnitxtWarning(
-                 f"Unable to convert input text to json format in JsonStrToListOfKeyValuePairs due to {e}. Text: {text}"
+                 f"Unable to convert input text to json format in JsonStrToDict due to {e}. Text: {text}"
              )
              dict_value = {}
          if not isoftype(dict_value, Dict[str, Any]):
              UnitxtWarning(
-                 f"Unable to convert input text to dictionary in JsonStrToListOfKeyValuePairs. Text: {text}"
+                 f"Unable to convert input text to dictionary in JsonStrToDict. Text: {text}"
              )
              dict_value = {}
-         return [
-             (str(key), str(value))
-             for key, value in dict_value.items()
-             if value is not None
-         ]
+         return {str(k): str(v) for k, v in dict_value.items() if v is not None}
 
 
 
version.py CHANGED
@@ -1 +1 @@
- version = "1.20.0"
+ version = "1.21.0"