from functools import lru_cache
from typing import Any, Dict, List, Optional, Union

from .artifact import fetch_artifact
from .logging_utils import get_logger
from .operator import StreamInstanceOperator
from .type_utils import (
get_args,
get_origin,
isoftype,
parse_type_string,
verify_required_schema,
)


class Tasker:
    pass


class FormTask(Tasker, StreamInstanceOperator):
"""FormTask packs the different instance fields into dictionaries by their roles in the task.
Attributes:
inputs (Union[Dict[str, str], List[str]]):
Dictionary with string names of instance input fields and types of respective values.
In case a list is passed, each type will be assumed to be Any.
outputs (Union[Dict[str, str], List[str]]):
Dictionary with string names of instance output fields and types of respective values.
In case a list is passed, each type will be assumed to be Any.
metrics (List[str]): List of names of metrics to be used in the task.
prediction_type (Optional[str]):
Need to be consistent with all used metrics. Defaults to None, which means that it will
be set to Any.

    The output instance contains three fields:
        "inputs" -- a sub-dictionary of the input instance, consisting of all the fields listed in Arg 'inputs'.
        "outputs" -- a sub-dictionary for the fields listed in Arg 'outputs'.
        "metrics" -- the value of Arg 'metrics'.
"""
inputs: Union[Dict[str, str], List[str]]
outputs: Union[Dict[str, str], List[str]]
metrics: List[str]
prediction_type: Optional[str] = None
    augmentable_inputs: List[str] = []

def verify(self):
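        # 'inputs' and 'outputs' that are not {field_name: type_string} dictionaries
        # (e.g. plain lists of field names) are coerced into dictionaries mapping every
        # field to 'Any', after emitting a deprecation-style warning.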
for io_type in ["inputs", "outputs"]:
data = self.inputs if io_type == "inputs" else self.outputs
if not isoftype(data, Dict[str, str]):
get_logger().warning(
f"'{io_type}' field of Task should be a dictionary of field names and their types. "
f"For example, {{'text': 'str', 'classes': 'List[str]'}}. Instead only '{data}' was "
f"passed. All types will be assumed to be 'Any'. In future version of unitxt this "
f"will raise an exception."
)
data = {key: "Any" for key in data}
if io_type == "inputs":
self.inputs = data
else:
self.outputs = data
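
        # Without an explicit prediction_type, compatibility with the metrics cannot be
        # checked, so it falls back to 'Any'.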
if not self.prediction_type:
get_logger().warning(
"'prediction_type' was not set in Task. It is used to check the output of "
"template post processors is compatible with the expected input of the metrics. "
"Setting `prediction_type` to 'Any' (no checking is done). In future version "
"of unitxt this will raise an exception."
)
self.prediction_type = "Any"
self.check_metrics_type()
for augmentable_input in self.augmentable_inputs:
assert (
augmentable_input in self.inputs
), f"augmentable_input {augmentable_input} is not part of {self.inputs}"
@staticmethod
@lru_cache(maxsize=None)
def get_metric_prediction_type(metric_id: str):
metric = fetch_artifact(metric_id)[0]
return metric.get_prediction_type()
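
    # The task's prediction_type must be compatible with each metric's expected
    # prediction type: identical, Any on either side, or a member of a Union on
    # the metric side.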
def check_metrics_type(self) -> None:
prediction_type = parse_type_string(self.prediction_type)
for metric_id in self.metrics:
metric_prediction_type = FormTask.get_metric_prediction_type(metric_id)
if (
prediction_type == metric_prediction_type
or prediction_type == Any
or metric_prediction_type == Any
or (
get_origin(metric_prediction_type) is Union
and prediction_type in get_args(metric_prediction_type)
)
):
continue
raise ValueError(
f"The task's prediction type ({prediction_type}) and '{metric_id}' "
f"metric's prediction type ({metric_prediction_type}) are different."
)
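
    # Check the instance against the declared input and output schemas, then project
    # the fields into the "inputs" and "outputs" sub-dictionaries.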
def process(
self, instance: Dict[str, Any], stream_name: Optional[str] = None
) -> Dict[str, Any]:
verify_required_schema(self.inputs, instance)
verify_required_schema(self.outputs, instance)
inputs = {key: instance[key] for key in self.inputs.keys()}
outputs = {key: instance[key] for key in self.outputs.keys()}
return {
"inputs": inputs,
"outputs": outputs,
"metrics": self.metrics,
        }


class MultipleChoiceTask(FormTask):
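    """A FormTask that renders the choices field as an enumerated list ("A. ...", "B. ...").

    The choices are joined with choices_separator, and the (first) output field is rewritten
    to the enumerated form of the target choice (letter only, unless use_text_in_target is set).
    """
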
choices_field: str = "choices"
choices_separator: str = "\n"
enumeration_suffix: str = ". "
use_text_in_target: bool = False
alphabet: str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
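
    # Render a single choice as its enumeration letter (e.g. "B"), optionally followed
    # by the enumeration suffix and the choice text (e.g. "B. no").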
def process_single_choice(
self, choice: str, index: int, use_text: bool = True
) -> str:
try:
processed_choice = f"{self.alphabet[index]}"
except IndexError as e:
raise ValueError(
f"Too many choices, the length of alphabet '{self.alphabet}': {len(self.alphabet)} is the limit"
) from e
if use_text:
processed_choice += f"{self.enumeration_suffix}{choice}"
return processed_choice
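
    # Render every choice and join them with choices_separator into a single string.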
def process_choices(self, choices: List[str]) -> str:
processed_choices = []
for index, choice in enumerate(choices):
processed_choices.append(self.process_single_choice(choice, index))
return self.choices_separator.join(processed_choices)
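
    # The target is rendered like a choice; by default only its letter is kept
    # (use_text_in_target=False).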
def process_target(self, choices, target_index):
return self.process_single_choice(
choices[target_index], target_index, use_text=self.use_text_in_target
)
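
    # After the base FormTask processing, replace the raw choices with their enumerated
    # rendering and rewrite the target output field accordingly.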
def process(
self, instance: Dict[str, Any], stream_name: Optional[str] = None
) -> Dict[str, Any]:
result = super().process(instance, stream_name)
target_key, target_value = next(iter(result["outputs"].items()))
choices = result["inputs"][self.choices_field]
target_index_in_choices = choices.index(target_value)
processed_choices = self.process_choices(choices)
processed_target = self.process_target(choices, target_index_in_choices)
result["inputs"][self.choices_field] = processed_choices
result["outputs"][target_key] = processed_target
return result