Upload folder using huggingface_hub
- formats.py +5 -5
- inference.py +19 -2
- llm_as_judge_constants.py +1 -1
- llm_as_judge_from_template.py +1 -0
- loaders.py +3 -1
- metrics.py +37 -38
- operator.py +4 -0
- operators.py +1 -1
- schema.py +3 -3
- settings_utils.py +3 -0
- standard.py +2 -2
- system_prompts.py +4 -2
- task.py +8 -2
- templates.py +4 -4
- text_utils.py +72 -2
- version.py +1 -1
formats.py
CHANGED
@@ -116,7 +116,7 @@ def apply_capital_new_line_notation(text: str) -> str:
 
 
 class BaseFormat(Format):
-    demos_field: str = "demos"
+    demos_field: str = constants.demos_field
 
     @staticmethod
     def _pop_field(instance, field_name, do_pop: bool = True) -> str:
@@ -133,14 +133,14 @@ class BaseFormat(Format):
     def _prepare_instance_fields(self, instance) -> Tuple[str]:
         instance_fields = {}
 
-        for field in "source", "instruction", "system_prompt", "target_prefix":
+        for field in "source", constants.instruction_field, constants.system_prompt_field, "target_prefix":
             instance_fields[field] = self._pop_field(instance, field)
 
         instance_fields["media"] = self._pop_field(instance, "media", do_pop=False)
         if not instance_fields["media"]:
             instance_fields["media"] = {"images": [], "audios": []}
 
-        instance_fields["demos"] = []
+        instance_fields[constants.demos_field] = []
         if self.demos_field is not None and self.demos_field in instance:
             demos = instance[self.demos_field]
             assert (
@@ -150,7 +150,7 @@ class BaseFormat(Format):
             demo = {}
             for field in ["source", "target", "target_prefix"]:
                 demo[field] = self._pop_field(demo_instance, field, do_pop=False)
-            instance_fields["demos"].append(demo)
+            instance_fields[constants.demos_field].append(demo)
 
         return instance_fields
 
@@ -219,7 +219,7 @@ class SystemFormat(BaseFormat):
     .. code-block::
 
         system_format = SystemFormat(
-            demos_field="demos",
+            demos_field=constants.demos_field,
            demo_format="Input: {source}\nOutput: {target}\n\n",
            model_input_format="Instruction: {instruction}\n\n{demos}Input: {source}\nOutput: ",
         )
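The edit above swaps the hard-coded "demos" literal (and, in the field loop, "instruction" and "system_prompt") for constants shared through settings_utils.py further down. A minimal sketch of the pattern, with a stand-in Constants holder rather than the real get_constants() machinery:

from dataclasses import dataclass

@dataclass(frozen=True)
class Constants:
    # Values mirror the ones added to settings_utils.py in this commit.
    demos_field: str = "demos"
    instruction_field: str = "instruction"
    system_prompt_field: str = "system_prompt"

constants = Constants()

def prepare_instance_fields(instance: dict) -> dict:
    # Every consumer now spells the field name one way, so renaming it
    # later means touching a single definition instead of many literals.
    return {constants.demos_field: instance.get(constants.demos_field, [])}

print(prepare_instance_fields({"demos": [{"source": "2+2", "target": "4"}]}))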
inference.py
CHANGED
@@ -1937,6 +1937,9 @@ class WMLChatParamsMixin(Artifact):
     time_limit: Optional[int] = None
     top_p: Optional[float] = None
     n: Optional[int] = None
+    seed: Optional[int] = None
+    logit_bias: Optional[Dict[str, Any]] = None
+    stop: Optional[List[str]] = None
 
 
 CredentialsWML = Dict[
@@ -2486,8 +2489,20 @@ class WMLInferenceEngineChat(WMLInferenceEngineBase, WMLChatParamsMixin):
         "of messages."
     )
 
+    @staticmethod
+    def check_instance_contains_image(instance: Dict[str, Any]) -> bool:
+        if "media" not in instance:
+            return False
+        if not isinstance(instance["media"], dict):
+            return False
+        if "images" not in instance["media"]:
+            return False
+        if not instance["media"]["images"]:
+            return False
+        return True
+
     def to_messages(self, instance: Union[Dict, List]) -> List[List[Dict[str, Any]]]:
-        if isinstance(instance["source"], str) and …
+        if isinstance(instance["source"], str) and self.check_instance_contains_image(instance):
             return self._create_messages_from_instance(instance)
 
         messages = super().to_messages(instance)
@@ -2985,7 +3000,9 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
             mapping each supported API to a corresponding
             model identifier string. This mapping allows consistent access to models
             across different API backends.
-        provider_specific_args: …
+        provider_specific_args:
+            (Optional[Dict[str, Dict[str,str]]]) Args specific to a provider for example provider_specific_args={"watsonx": {"max_requests_per_second": 4}}
+
     """
 
     label: str = "cross_provider"
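The new check_instance_contains_image guard is self-contained and can be exercised in isolation; a small usage sketch with made-up instances (the function body is copied from the diff above):

from typing import Any, Dict

def check_instance_contains_image(instance: Dict[str, Any]) -> bool:
    # An instance counts as multimodal only when "media" exists, is a
    # dict, has an "images" key, and that list is non-empty.
    if "media" not in instance:
        return False
    if not isinstance(instance["media"], dict):
        return False
    if "images" not in instance["media"]:
        return False
    if not instance["media"]["images"]:
        return False
    return True

print(check_instance_contains_image({"source": "hi"}))                                  # False
print(check_instance_contains_image({"source": "hi", "media": {"images": []}}))         # False
print(check_instance_contains_image({"source": "hi", "media": {"images": ["img-0"]}}))  # True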
llm_as_judge_constants.py
CHANGED
@@ -205,7 +205,7 @@ class DirectCriteriaCatalogEnum(Enum):
             ),
             CriteriaOption(
                 "Pass",
-                "There is no …
+                "There is no numerical temperature reading in the response.",
             ),
         ],
         {"Yes": 1.0, "No": 0.5, "Pass": 0.0},
llm_as_judge_from_template.py
CHANGED
@@ -37,6 +37,7 @@ class LLMAsJudgeBase(BulkInstanceMetric, ArtifactFetcherMixin):
         inference_model (InferenceEngine): The module that creates the inference of the judge llm.
         reduction_map (dict): A dictionary specifying the reduction method for the metric.
         batch_size (int): The size of the bulk.
+
     """
 
     main_score: str = "llm_as_judge"
loaders.py
CHANGED
@@ -443,11 +443,13 @@ class LoadCSV(LazyLoader):
                     dataset = reader(self.files[split], **self.get_args()).to_dict(
                         "records"
                     )
+                    break
                 except ValueError:
                     import fsspec
 
                     with fsspec.open(self.files[split], mode="rt") as f:
                         dataset = reader(f, **self.get_args()).to_dict("records")
+                    break
                 except Exception as e:
                     logger.debug(f"Attempt csv load {attempt + 1} failed: {e}")
                     if attempt < settings.loaders_max_retries - 1:
@@ -601,7 +603,7 @@ class LoadFromIBMCloud(Loader):
         bucket_name='my-bucket'
     )
     multi_stream = load_ibm_cloud.process()
-    """
+    """  # pragma: allowlist secret
 
     endpoint_url_env: str
     aws_access_key_id_env: str
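The two added break statements are the substance of the LoadCSV fix: the read runs inside a retry loop, and without them a successful attempt would fall through and load again. A stripped-down sketch of the control flow, with MAX_RETRIES standing in for settings.loaders_max_retries:

import logging

logger = logging.getLogger(__name__)
MAX_RETRIES = 3  # stand-in for settings.loaders_max_retries

def load_with_retries(read):
    # Exit the loop on the first attempt that does not raise; re-raise
    # only once the retry budget is exhausted.
    for attempt in range(MAX_RETRIES):
        try:
            dataset = read()
            break
        except Exception as e:
            logger.debug(f"Attempt csv load {attempt + 1} failed: {e}")
            if attempt == MAX_RETRIES - 1:
                raise
    return dataset

print(load_with_retries(lambda: [{"a": 1}]))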
metrics.py
CHANGED
@@ -75,6 +75,7 @@ settings = get_settings()
 
 warnings.filterwarnings("ignore", category=DegenerateDataWarning)
 
+
 def hf_evaluate_load(path: str, *args, **kwargs):
     if settings.hf_offline_metrics_path is not None:
         path = os.path.join(settings.hf_offline_metrics_path, path)
@@ -83,13 +84,18 @@ def hf_evaluate_load(path: str, *args, **kwargs):
         *args,
         **kwargs,
         experiment_id=str(uuid.uuid4()),
-        …
+        download_config=DownloadConfig(
+            max_retries=settings.loaders_max_retries,
+        ),
+        verification_mode="no_checks",
+        trust_remote_code=settings.allow_unverified_code,
+        download_mode=(
+            "force_redownload"
+            if settings.disable_hf_datasets_cache
+            else "reuse_dataset_if_exists"
+        ),
+    )
+
 
 class MetricsList(ListCollection):
     def verify(self):
@@ -2311,13 +2317,11 @@ class HuggingfaceMetric(GlobalMetric):
             Documentation.HUGGINGFACE_METRICS,
         )
 
-        assert (
-            self.hf_additional_input_fields
-            or isoftype(self.hf_additional_input_fields, List[str])
+        assert self.hf_additional_input_fields is None or isoftype(
+            self.hf_additional_input_fields, List[str]
         ), f"Argument hf_additional_input_fields should be either None or List[str]. It is now: {self.hf_additional_input_fields}."
-        assert (
-            self.hf_additional_input_fields_pass_one_value
-            or isoftype(self.hf_additional_input_fields_pass_one_value, List[str])
+        assert self.hf_additional_input_fields_pass_one_value is None or isoftype(
+            self.hf_additional_input_fields_pass_one_value, List[str]
        ), f"Argument hf_additional_input_fields_pass_one_value should be either None or List[str]. It is now: {self.hf_additional_input_fields_pass_one_value}."
 
         return super().verify()
@@ -2826,9 +2830,7 @@ class F1MultiLabel(GlobalMetric, PackageRequirementsMixin):
     def prepare(self):
         super().prepare()
 
-        self._metric = hf_evaluate_load(
-            self.metric, "multilabel"
-        )
+        self._metric = hf_evaluate_load(self.metric, "multilabel")
 
     def add_str_to_id(self, str):
         if str not in self.str_to_id:
@@ -2885,8 +2887,8 @@ class F1MultiLabel(GlobalMetric, PackageRequirementsMixin):
             labels=labels_param,
         )
         if isinstance(result[self.metric], numpy.ndarray):
-            assert (
-                len(result[self.metric]) == len(labels)
+            assert len(result[self.metric]) == len(
+                labels
             ), f"F1 result ({result[self.metric]}) has more entries than labels ({labels})"
             final_result = {self.main_score: nan_mean(result[self.metric])}
             for i, label in enumerate(labels):
@@ -3625,7 +3627,9 @@ class Detector(BulkInstanceMetric):
         if settings.hf_offline_models_path is not None:
             model_path = os.path.join(settings.hf_offline_models_path, model_path)
         self.pipe = pipeline(
-            "text-classification", …
+            "text-classification",
+            model=model_path,
+            device=device,
         )
 
     def compute(
@@ -3662,7 +3666,6 @@ class RegardMetric(GlobalMetric):
             model_path = os.path.join(settings.hf_offline_models_path, model_path)
         self.regard_model = AutoModelForSequenceClassification.from_pretrained(
             model_path,
-            …
         )
         self.regard_tokenizer = AutoTokenizer.from_pretrained(model_path)
 
@@ -3865,9 +3868,9 @@ class LlamaIndexLLMMetric(InstanceMetric):
     prediction_type = str
     reduction_map: Dict[str, List[str]] = None
     openai_models: List[str] = ["gpt-3.5-turbo"]
-    anthropic_models: List[…
+    anthropic_models: List[str] = (
+        []
+    )  # this is here for the sake of documentation for future models
     mock_models: List[str] = ["mock"]
     external_api_models = openai_models + anthropic_models
     data_classification_policy = ["public"]
@@ -4123,9 +4126,7 @@ class Perplexity(BulkInstanceMetric):
         model_path = self.model_name
         if settings.hf_offline_models_path is not None:
             model_path = os.path.join(settings.hf_offline_models_path, model_path)
-        self.model = (
-            self.model_class().from_pretrained(model_path).to(self.device)
-        )
+        self.model = self.model_class().from_pretrained(model_path).to(self.device)
         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
         if self.tokenizer.pad_token_id is None:
             self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
@@ -4291,7 +4292,7 @@ class FaithfulnessHHEM(BulkInstanceMetric):
     batch_size: int = 2
     model_name: str = "vectara/hallucination_evaluation_model"
     prediction_type = str
-    …
+    # single_reference_per_prediction = True
     max_context_words = 4096
     reduction_map = {"mean": [main_score]}
 
@@ -4308,6 +4309,7 @@ class FaithfulnessHHEM(BulkInstanceMetric):
         else:
             device = "cpu"
         from transformers import AutoModelForSequenceClassification
+
         model_path = self.model_name
         if settings.hf_offline_models_path is not None:
             model_path = os.path.join(settings.hf_offline_models_path, model_path)
@@ -5955,6 +5957,7 @@ class GraniteGuardianBase(InstanceMetric):
 
     def prepare(self):
         from transformers import AutoTokenizer
+
         if not isinstance(self.risk_type, RiskType):
             self.risk_type = RiskType[self.risk_type]
         if not hasattr(self, "_tokenizer") or self._tokenizer is None:
@@ -6268,18 +6271,10 @@ class SQLExecutionAccuracy(InstanceMetric):
         if df1.shape != df2.shape:
             return False
 
-        …
-        # if not return False, if all the columns worked return tue
-        for df1_col in df1.columns:
-            col_matched = False
-            for df2_col in df2.columns:
-                if all(df1[df1_col].values == df2[df2_col].values):
-                    col_matched = True
-            if not col_matched:
-                return False
+        df1_rows_sorted = [sorted(map(str, row)) for row in df1.to_numpy()]
+        df2_rows_sorted = [sorted(map(str, row)) for row in df2.to_numpy()]
 
-        return True
+        return df1_rows_sorted == df2_rows_sorted
 
     @staticmethod
     def is_subset_ignore_colnames(df1, df2):
@@ -6381,6 +6376,8 @@ class SQLExecutionAccuracy(InstanceMetric):
                 gold_error,
             )
 
+        if isinstance(gold_res, dict) and "results" in gold_res:
+            gold_res = gold_res["results"]
         gold_df = pd.DataFrame(gold_res)
         non_empty_gold_df = 0 if gold_df.empty else 1
 
@@ -6444,6 +6441,8 @@ class SQLExecutionAccuracy(InstanceMetric):
                 pred_error,
             )
 
+        if isinstance(pred_res, dict) and "results" in pred_res:
+            pred_res = pred_res["results"]
         predicted_df = pd.DataFrame(pred_res)
 
         execution_result = (
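Among the metrics.py edits, the SQLExecutionAccuracy rewrite replaces the old column-by-column scan with a row-wise comparison that ignores column names and column order. A self-contained sketch of the new logic (the free-function name is illustrative; the method itself lives on SQLExecutionAccuracy):

import pandas as pd

def rows_equivalent_ignore_colnames(df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
    # Two frames match when, after stringifying and sorting each row's
    # values, the rows agree positionally; column labels play no part.
    if df1.shape != df2.shape:
        return False
    df1_rows_sorted = [sorted(map(str, row)) for row in df1.to_numpy()]
    df2_rows_sorted = [sorted(map(str, row)) for row in df2.to_numpy()]
    return df1_rows_sorted == df2_rows_sorted

a = pd.DataFrame({"id": [1, 2], "name": ["x", "y"]})
b = pd.DataFrame({"n": ["x", "y"], "k": [1, 2]})  # renamed, reordered columns
print(rows_equivalent_ignore_colnames(a, b))  # True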
operator.py
CHANGED
@@ -157,6 +157,7 @@ class StreamingOperator(Operator, PackageRequirementsMixin):
     """
 
 
+
 class SideEffectOperator(StreamingOperator):
     """Base class for operators that does not affect the stream."""
 
@@ -249,6 +250,9 @@ class SourceOperator(MultiStreamOperator):
         pass
 
 
+    def get_splits(self):
+        return list(self.process().keys())
+
 class StreamInitializerOperator(SourceOperator):
     """A class representing a stream initializer operator in the streaming system.
 
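The new get_splits helper just surfaces the split names of whatever mapping process() returns. A toy stand-in (not the real SourceOperator) showing the intent:

class ToySource:
    # process() maps split names to streams; get_splits exposes the names
    # without requiring callers to consume the streams themselves.
    def process(self):
        return {"train": iter([{"x": 1}]), "test": iter([{"x": 2}])}

    def get_splits(self):
        return list(self.process().keys())

print(ToySource().get_splits())  # ['train', 'test']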
operators.py
CHANGED
@@ -1527,7 +1527,7 @@ class IntersectCorrespondingFields(InstanceOperator):
 
         if not isinstance(self.allowed_values, list):
             raise ValueError(
-                f"The …
+                f"The allowed_values is not a type list but '{type(self.allowed_values)}'"
             )
 
     def process(
schema.py
CHANGED
@@ -151,10 +151,10 @@ class FinalizeDataset(InstanceOperatorValidator):
         )
         if "criteria" in task_data and isinstance(task_data["criteria"], Artifact):
             task_data["criteria"] = self.artifact_to_jsonable(task_data["criteria"])
-        if "demos" in instance:
-            task_data["demos"] = [
+        if constants.demos_field in instance:
+            task_data[constants.demos_field] = [
                 self._get_instance_task_data(instance)
-                for instance in instance.pop("demos")
+                for instance in instance.pop(constants.demos_field)
             ]
 
         instance = self.serialize_instance_fields(instance, task_data)
settings_utils.py
CHANGED
@@ -192,6 +192,9 @@ if Constants.is_uninitilized():
     constants.instance_stream = "__INSTANCE_STREAM__"
     constants.image_tag = "unitxt-img"
     constants.demos_pool_field = "_demos_pool_"
+    constants.demos_field = "demos"
+    constants.instruction_field = "instruction"
+    constants.system_prompt_field = "system_prompt"
 
 
 def get_settings() -> Settings:
standard.py
CHANGED
@@ -276,7 +276,7 @@ class DatasetRecipe(SourceSequentialOperator):
     demos_pool_field_name: str = constants.demos_pool_field
 
     demos_taken_from: str = "train"
-    demos_field: str = "demos"
+    demos_field: str = constants.demos_field
     sampler: Sampler = None
 
     # do not push demos to instances whose "demos" field is already populated
@@ -608,7 +608,7 @@ class DatasetRecipe(SourceSequentialOperator):
             )
         )
         self.verbalization.steps.append(
-            GetLength(field="demos", to_field="recipe_metadata/num_demos")
+            GetLength(field=constants.demos_field, to_field="recipe_metadata/num_demos")
         )
         self.verbalization.steps.append(
             Set(
system_prompts.py
CHANGED
@@ -3,7 +3,9 @@ from typing import Any, Dict, Optional
 
 from .dataclass import NonPositionalField
 from .operator import InstanceOperator
+from .settings_utils import get_constants
 
+constants = get_constants()
 
 class SystemPrompt(InstanceOperator):
     """The role of SystemPrompt is to add task-independent opening-text to every instance."""
@@ -14,10 +16,10 @@ class SystemPrompt(InstanceOperator):
         self, instance: Dict[str, Any], stream_name: Optional[str] = None
     ) -> Dict[str, Any]:
         if self.skip_rendered_instance:
-            if "system_prompt" in instance:
+            if constants.system_prompt_field in instance:
                 return instance
 
-        instance["system_prompt"] = self.get_system_prompt(instance)
+        instance[constants.system_prompt_field] = self.get_system_prompt(instance)
 
         return instance
 
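Behaviorally this file is unchanged; the field name just moves behind constants.system_prompt_field. A toy sketch of the skip_rendered_instance guard it participates in, with a plain string standing in for the constant:

SYSTEM_PROMPT_FIELD = "system_prompt"  # matches constants.system_prompt_field

def add_system_prompt(instance: dict, prompt: str, skip_rendered_instance: bool = True) -> dict:
    # An instance that already carries a system prompt is passed through
    # untouched; otherwise the operator's prompt is attached.
    if skip_rendered_instance and SYSTEM_PROMPT_FIELD in instance:
        return instance
    instance[SYSTEM_PROMPT_FIELD] = prompt
    return instance

print(add_system_prompt({"source": "q"}, "Be concise."))
print(add_system_prompt({"source": "q", "system_prompt": "Custom"}, "Be concise."))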
task.py
CHANGED
@@ -302,9 +302,15 @@ class Task(InstanceOperator, ArtifactFetcherMixin):
             "media": instance.get("media", {}),
             "recipe_metadata": instance.get("recipe_metadata", {}),
         }
-        if "demos" in instance:
+        if constants.demos_field in instance:
             # for the case of recipe.skip_demoed_instances
-            result["demos"] = instance["demos"]
+            result[constants.demos_field] = instance[constants.demos_field]
+
+        if constants.instruction_field in instance:
+            result[constants.instruction_field] = instance[constants.instruction_field]
+
+        if constants.system_prompt_field in instance:
+            result[constants.system_prompt_field] = instance[constants.system_prompt_field]
 
         if stream_name == constants.inference_stream:
             return result
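Beyond the constant rename, the Task change widens the pass-through: instruction and system_prompt set on an incoming instance now survive into the task result alongside demos. A compact sketch of that behavior (build_result is a stand-in for the real method):

PASS_THROUGH = ("demos", "instruction", "system_prompt")

def build_result(instance: dict) -> dict:
    result = {"media": instance.get("media", {})}
    # Per-instance overrides are copied forward only when present.
    for field in PASS_THROUGH:
        if field in instance:
            result[field] = instance[field]
    return result

print(build_result({"media": {}, "instruction": "Answer in one word."}))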
templates.py
CHANGED
@@ -76,9 +76,9 @@ class Template(InstanceOperator):
             self.postprocessors, List[Union[Operator, str]]
         ), f"The template post processors field '{self.postprocessors}' is not a list of processors. Instead it is of type '{to_type_string(type(self.postprocessors))}'."
 
-    def input_fields_to_instruction_and_target_prefix(self, input_fields):
+    def input_fields_to_instruction_and_target_prefix(self, input_fields, instruction):
         instruction = self.apply_formatting(
-            input_fields, "input field", self.instruction, "instruction"
+            input_fields, "input field", instruction, "instruction"
         )
         target_prefix = self.apply_formatting(
             input_fields,
@@ -126,13 +126,13 @@ class Template(InstanceOperator):
 
         source = self.input_fields_to_source(serialized_inputs)
         instruction, target_prefix = self.input_fields_to_instruction_and_target_prefix(
-            serialized_inputs
+            serialized_inputs, instance.get(constants.instruction_field, self.instruction)
         )
 
         result = {
             **instance,
             "source": source,
-            "instruction": instruction,
+            constants.instruction_field: instruction,
             "target_prefix": target_prefix,
             "postprocessors": self.postprocessors,
         }
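The Template change threads a per-instance instruction into formatting: the instance value wins when present, and the template's own instruction remains the fallback. The resolution rule in isolation:

def resolve_instruction(instance: dict, template_instruction: str) -> str:
    # Mirrors instance.get(constants.instruction_field, self.instruction).
    return instance.get("instruction", template_instruction)

print(resolve_instruction({}, "Classify the text."))                             # template default
print(resolve_instruction({"instruction": "Summarize."}, "Classify the text."))  # per-instance override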
text_utils.py
CHANGED
@@ -201,7 +201,7 @@ def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:
     assert (
         indent_delta >= 2
     ), f"Needs at least 2 position indentations, for the case of list elements, that are to be preceded each by ' -'. Got indent_delta={indent_delta}."
-    res = []  # …
+    res = []  # computed hereunder as a list of lines, that are indented only at the end
 
     if isinstance(d, dict):
         if len(d) == 0:
@@ -236,6 +236,72 @@ def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:
             d1 = f'"{d1}"'
         return [d1]
 
+def construct_dict_as_python_lines(d, indent_delta=4) -> List[str]:
+    """Constructs the lines of a dictionary formatted as a piece of python code.
+
+    Args:
+        d: The element to be formatted.
+        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.
+    """
+    indent_delta_str = " " * indent_delta
+    res = []  # computed hereunder as a list of lines, that are indented only at the end
+
+    if isinstance(d, dict):
+        istype = False
+        if len(d) == 0:
+            return ["{}"]
+        if "__type__" in d:
+            istype = True
+            res = ["__type__" + d["__type__"] + "("]
+            if len(d) == 1:
+                res[0] += ")"
+                return res
+        else:
+            res = ["{"]
+        for key, val in d.items():
+            if key == "__type__":
+                continue
+            printable_key = f'"{key}"' if not istype else key
+            res.append(printable_key + ("=" if istype else ": "))
+            py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)
+            assert len(py_for_val) > 0
+            if len(py_for_val) == 1:
+                res[-1] += py_for_val[0] + ","
+            else:
+                res[-1] += py_for_val[0]
+                if py_for_val[0].startswith("{") or py_for_val[0].startswith("["):
+                    for line in py_for_val[1:-1]:
+                        res.append(indent_delta_str + line)
+                else:
+                    # val is type, its inner lines are already indented
+                    res.extend(py_for_val[1:-1])
+                res.append(py_for_val[-1] + ",")
+        res.append(")" if istype else "}")
+        if istype:
+            for i in range(1, len(res) - 1):
+                res[i] = indent_delta_str + res[i]
+        return res
+
+    if isinstance(d, list):
+        if len(d) == 0:
+            return ["[]"]
+        res = ["["]
+        for val in d:
+            py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)
+            assert len(py_for_val) > 0
+            for line in py_for_val[:-1]:
+                res.append(line)
+            res.append(py_for_val[-1] + ",")
+        res.append("]")
+        return res
+
+    # d1 = re.sub(r"(\n+)", r'"\1"', str(d))
+    if isinstance(d, str):
+        return [f'"{d}"']
+    if d is None or isinstance(d, (int, float, bool)):
+        return [f"{d}"]
+    raise RuntimeError(f"unrecognized value to print as python: {d}")
+
 
 def print_dict(
     d, indent=0, indent_delta=4, max_chars=None, keys_to_print=None, log_level="info"
@@ -246,11 +312,15 @@ def print_dict(
 
 
 def print_dict_as_yaml(d: dict, indent_delta=2) -> str:
-    yaml_lines = construct_dict_as_yaml_lines(d)
+    yaml_lines = construct_dict_as_yaml_lines(d, indent_delta=indent_delta)
     # yaml_lines = [re.sub(r"(\n+)", r'"\1"', line) for line in yaml_lines]
     # yaml_lines = [line.replace("\n", "\\n") for line in yaml_lines]
     return "\n".join(yaml_lines)
 
+def print_dict_as_python(d: dict, indent_delta=4) -> str:
+    py_lines = construct_dict_as_python_lines(d, indent_delta=indent_delta)
+    assert len(py_lines) > 0
+    return "\n".join(py_lines)
 
 def nested_tuple_to_string(nested_tuple: tuple) -> str:
     """Converts a nested tuple to a string, with elements separated by underscores.
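A usage sketch for the new helpers, assuming this module ships as unitxt.text_utils (the "unitxt-img" constant in settings_utils.py suggests the unitxt package):

from unitxt.text_utils import print_dict_as_python

# Plain dicts, lists, strings, numbers, and None are rendered as literals;
# dicts carrying a "__type__" key are rendered as constructor calls instead.
print(print_dict_as_python({"metric": "f1_micro", "labels": ["yes", "no"]}))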
version.py
CHANGED
@@ -1 +1 @@
-version = "1.…"
+version = "1.20.0"
|