# source: https://github.com/mponty/bigcode-dataset/tree/main/pii/ner_model_training/utils by @mponty
import numpy as np
from evaluate import load
from scipy.special import softmax
from sklearn.metrics import average_precision_score

_seqeval_metric = load("seqeval")

# NER tags
CATEGORIES = [
    "NAME",
    "EMAIL",
    "EMAIL_EXAMPLE",
    "USERNAME",
    "KEY",
    "IP_ADDRESS",
    "PASSWORD",
]
# Annotation classes excluded from training/evaluation.
IGNORE_CLASS = ["AMBIGUOUS", "ID", "NAME_EXAMPLE", "USERNAME_EXAMPLE"]

# Build BIO label maps: "O" is 0, then a B-/I- pair for each category.
LABEL2ID = {"O": 0}
for cat in CATEGORIES:
    LABEL2ID[f"B-{cat}"] = len(LABEL2ID)
    LABEL2ID[f"I-{cat}"] = len(LABEL2ID)
ID2LABEL = {v: k for k, v in LABEL2ID.items()}


def compute_ap(pred, truth):
    """Token-level average precision for "is this token part of any entity?".

    Uses 1 - P(O) from the softmaxed logits as the per-token score and
    `truth != 0` (i.e. not the "O" label) as the binary target. Tokens
    labeled -100 (special/ignored tokens) are dropped.
    """
    pred_proba = 1 - softmax(pred, axis=-1)[..., 0]
    pred_proba, truth = pred_proba.flatten(), np.array(truth).flatten()
    pred_proba = pred_proba[truth != -100]
    truth = truth[truth != -100]
    return average_precision_score(truth != 0, pred_proba)


def compute_metrics(p):
    """Metrics function compatible with the Hugging Face `Trainer`.

    `p` is a (predictions, labels) pair: logits of shape
    (batch, seq_len, num_labels) and integer labels of shape
    (batch, seq_len), with -100 marking tokens to ignore.
    """
    predictions, labels = p
    avg_prec = compute_ap(predictions, labels)
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens) and map ids back to label strings.
    true_predictions = [
        [ID2LABEL[pred] for (pred, lab) in zip(prediction, label) if lab != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [ID2LABEL[lab] for (pred, lab) in zip(prediction, label) if lab != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Entity-level precision/recall/F1 via seqeval.
    results = _seqeval_metric.compute(
        predictions=true_predictions,
        references=true_labels,
        zero_division=0,
    )
    agg_metrics = {
        "Avg.Precision": avg_prec,
        "precision": results.pop("overall_precision"),
        "recall": results.pop("overall_recall"),
        "f1": results.pop("overall_f1"),
    }
    results.pop("overall_accuracy")
    # Remaining entries are per-category dicts; keep only their F1 scores.
    per_cat_metrics = {name: metrics["f1"] for name, metrics in results.items()}

    return dict(**agg_metrics, **per_cat_metrics)
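

if __name__ == "__main__":
    # Minimal usage sketch (an addition, not part of the original utils file):
    # exercise compute_metrics on a tiny synthetic batch. Shapes follow the
    # Hugging Face Trainer convention assumed above: logits of shape
    # (batch, seq_len, num_labels) and labels of shape (batch, seq_len),
    # with -100 on special tokens. The random logits are illustrative only.
    rng = np.random.default_rng(0)
    logits = rng.normal(size=(2, 6, len(LABEL2ID)))
    labels = np.array(
        [
            [-100, LABEL2ID["B-EMAIL"], LABEL2ID["I-EMAIL"], 0, 0, -100],
            [-100, 0, LABEL2ID["B-NAME"], LABEL2ID["I-NAME"], 0, -100],
        ]
    )
    print(compute_metrics((logits, labels)))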