Spaces:
Running
Running
from typing import Optional | |
import numpy as np | |
import weave | |
class AccuracyMetric(weave.Scorer): | |
def score(self, output: dict, label: int): | |
return {"correct": bool(label) == output["safe"]} | |
def summarize(self, score_rows: list) -> Optional[dict]: | |
valid_data = [ | |
x.get("correct") for x in score_rows if x.get("correct") is not None | |
] | |
count_true = list(valid_data).count(True) | |
int_data = [int(x) for x in valid_data] | |
sample_mean = np.mean(int_data) if int_data else 0 | |
sample_variance = np.var(int_data) if int_data else 0 | |
sample_error = np.sqrt(sample_variance / len(int_data)) if int_data else 0 | |
# Calculate precision, recall, and F1 score | |
true_positives = count_true | |
false_positives = len(valid_data) - count_true | |
false_negatives = len(score_rows) - len(valid_data) | |
precision = ( | |
true_positives / (true_positives + false_positives) | |
if (true_positives + false_positives) > 0 | |
else 0 | |
) | |
recall = ( | |
true_positives / (true_positives + false_negatives) | |
if (true_positives + false_negatives) > 0 | |
else 0 | |
) | |
f1_score = ( | |
(2 * precision * recall) / (precision + recall) | |
if (precision + recall) > 0 | |
else 0 | |
) | |
return { | |
"correct": { | |
"true_count": count_true, | |
"false_count": len(score_rows) - count_true, | |
"true_fraction": float(sample_mean), | |
"false_fraction": 1.0 - float(sample_mean), | |
"stderr": float(sample_error), | |
"precision": precision, | |
"recall": recall, | |
"f1_score": f1_score, | |
} | |
} | |