Upload NLIScorer
Browse files- pipeline.py +36 -7
pipeline.py
CHANGED
@@ -330,28 +330,57 @@ TASK_CLASSES = {
|
|
330 |
"Quality/Response/Verbosity": QualityResponseVerbosity,
|
331 |
}
|
332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
333 |
|
334 |
class NLIScorer(Pipeline):
|
335 |
def _sanitize_parameters(self, **kwargs):
|
336 |
preprocess_kwargs = {}
|
337 |
postprocess_kwargs = {}
|
338 |
-
if "threshold" in kwargs:
|
339 |
-
postprocess_kwargs["threshold"] = kwargs["threshold"]
|
340 |
return preprocess_kwargs, {}, postprocess_kwargs
|
341 |
|
342 |
def preprocess(self, inputs):
|
343 |
task_name = inputs.pop("task_type")
|
344 |
TaskClass = TASK_CLASSES[task_name]
|
345 |
task_class = TaskClass(tokenizer=self.tokenizer, **inputs)
|
346 |
-
return
|
|
|
|
|
|
|
347 |
|
348 |
def _forward(self, model_inputs):
|
349 |
-
outputs = self.model(**model_inputs)
|
350 |
-
return outputs
|
351 |
|
352 |
-
def postprocess(self, model_outputs
|
353 |
pos_scores = model_outputs["logits"].softmax(-1)[0][1]
|
354 |
-
best_class = int(pos_scores > threshold)
|
355 |
if best_class == 1:
|
356 |
score = pos_scores
|
357 |
else:
|
|
|
330 |
"Quality/Response/Verbosity": QualityResponseVerbosity,
|
331 |
}
|
332 |
|
333 |
+
# Per-task decision thresholds applied to the NLI model's positive-class
# probability when converting it to a binary label (used by
# NLIScorer.postprocess via the dict returned from preprocess).
# NOTE(review): values look like per-task tuned operating points --
# provenance (validation set / tuning procedure) not visible here; confirm.
TASK_THRESHOLDS = {
    "Detection/Hallucination/Factual Consistency": 0.5895,
    "Detection/Prompt Injection": 0.4147,
    "Detection/Source Code": 0.4001,
    "Detection/Toxicity/Disability": 0.5547,
    "Detection/Toxicity/Gender": 0.4007,
    "Detection/Toxicity/Identity Hate": 0.5502,
    "Detection/Toxicity/Insult": 0.4913,
    "Detection/Toxicity/Obscene": 0.448,
    "Detection/Toxicity/Race": 0.5983,
    "Detection/Toxicity/Religion": 0.4594,
    "Detection/Toxicity/Toxic": 0.5034,
    "Detection/Toxicity/Violence": 0.4031,
    "Quality/Context/Document Relevance": 0.5809,
    "Quality/Context/Document Utilization": 0.4005,
    "Quality/Context/Sentence Relevance": 0.6003,
    "Quality/Context/Sentence Utilization": 0.5417,
    "Quality/Response/Adherence": 0.59,
    "Quality/Response/Attribution": 0.5304,
    "Quality/Response/Coherence": 0.6891,
    "Quality/Response/Complexity": 0.7235,
    "Quality/Response/Correctness": 0.6535,
    "Quality/Response/Helpfulness": 0.4445,
    "Quality/Response/Instruction Following": 0.5323,
    "Quality/Response/Relevance": 0.4011,
    "Quality/Response/Verbosity": 0.4243,
}
|
360 |
+
|
361 |
|
362 |
class NLIScorer(Pipeline):
|
363 |
def _sanitize_parameters(self, **kwargs):
|
364 |
preprocess_kwargs = {}
|
365 |
postprocess_kwargs = {}
|
|
|
|
|
366 |
return preprocess_kwargs, {}, postprocess_kwargs
|
367 |
|
368 |
def preprocess(self, inputs):
    """Tokenize the inputs for the requested task.

    Pops ``"task_type"`` from ``inputs`` (NOTE: this mutates the caller's
    dict), looks up the matching task handler in ``TASK_CLASSES``, and
    forwards the remaining keys to it together with the pipeline tokenizer.
    Returns a dict carrying both the tokenized model inputs and the task's
    decision threshold so downstream steps can use them.
    """
    task_type = inputs.pop("task_type")
    handler = TASK_CLASSES[task_type](tokenizer=self.tokenizer, **inputs)
    return {
        "model_inputs": handler.as_model_inputs,
        "threshold": TASK_THRESHOLDS[task_type],
    }
|
376 |
|
377 |
def _forward(self, model_inputs):
|
378 |
+
outputs = self.model(**model_inputs["model_inputs"])
|
379 |
+
return {"logits": outputs["logits"], "threshold": model_inputs["threshold"]}
|
380 |
|
381 |
+
def postprocess(self, model_outputs):
|
382 |
pos_scores = model_outputs["logits"].softmax(-1)[0][1]
|
383 |
+
best_class = int(pos_scores > model_outputs["threshold"])
|
384 |
if best_class == 1:
|
385 |
score = pos_scores
|
386 |
else:
|