from typing import Dict, Any

import datasets
import evaluate
import numpy as np
from evaluate.utils.file_utils import add_start_docstrings

_DESCRIPTION = """
The "top-5 error" is the percentage of times that the target label does not appear among the 5 highest-probability predictions. It can be computed with:
Top-5 Error Rate = 1 - Top-5 Accuracy
or equivalently:
Top-5 Error Rate = (Number of incorrect top-5 predictions) / (Total number of cases processed)
 Where:
- Top-5 Accuracy: The proportion of cases where the true label is among the model's top 5 predicted classes.
- Incorrect top-5 prediction: The true label is not in the top 5 predicted classes (ranked by probability).
"""

_KWARGS_DESCRIPTION = """
Args:
    predictions (`list` of `list` of `int`): Predicted labels. Each inner list should contain the top-5 predicted class indices.
    references (`list` of `int`): Ground truth labels.
Returns:
    top5_error_rate (`float`): Top-5 Error Rate score. Minimum possible value is 0. Maximum possible value is 1.0.
Examples:
    >>> metric = evaluate.load("top5_error_rate")
    >>> results = metric.compute(
    ...     references=[0, 1, 2], 
    ...     predictions=[[0, 1, 2, 3, 4], [1, 0, 2, 3, 4], [2, 0, 1, 3, 4]]
    ... )
    >>> print(results)
    {'top5_error_rate': 0.0}
"""

_CITATION = """
"""


@add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Top5ErrorRate(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("float32")),
                    "references": datasets.Sequence(datasets.Value("int32")),
                }
                if self.config_name == "multilabel"
                else {
                    # _compute expects one score per class for every sample,
                    # so predictions must be a sequence of floats, not a scalar.
                    "predictions": datasets.Sequence(datasets.Value("float32")),
                    "references": datasets.Value("int32"),
                }
            ),
            reference_urls=[],
        )

    def _compute(
            self,
            *,
            predictions: list[list[float]] = None,
            references: list[int] = None,
            **kwargs,
    ) -> Dict[str, Any]:
        # Convert inputs to numpy arrays; outputs has shape (num_samples, num_classes).
        outputs = np.array(predictions, dtype=np.float32)
        labels = np.array(references)

        # Top-1 accuracy: the highest-scored class matches the reference label.
        pred = outputs.argmax(axis=1)
        acc = (pred == labels).mean()

        # Top-5 error rate: fraction of samples whose reference label is not
        # among the 5 highest-scored classes.
        top5_indices = outputs.argsort(axis=1)[:, -5:]
        correct = (labels.reshape(-1, 1) == top5_indices).any(axis=1)
        top5_error_rate = 1 - correct.mean()

        return {
            "accuracy": float(acc),
            "top5_error_rate": float(top5_error_rate),
        }
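

# --- Usage sketch (illustrative, not part of the metric implementation) ---
# A minimal example of how the metric above might be exercised locally by
# instantiating the class directly; in practice it would normally be loaded via
# evaluate.load(), as shown in _KWARGS_DESCRIPTION. The score vectors below are
# made-up probabilities for a 5-class problem.
if __name__ == "__main__":
    metric = Top5ErrorRate()
    results = metric.compute(
        references=[0, 1, 2],
        predictions=[
            [0.6, 0.1, 0.1, 0.1, 0.1],  # label 0 is the argmax -> top-1 and top-5 hit
            [0.1, 0.6, 0.1, 0.1, 0.1],  # label 1 is the argmax
            [0.1, 0.1, 0.6, 0.1, 0.1],  # label 2 is the argmax
        ],
    )
    print(results)  # expected: {'accuracy': 1.0, 'top5_error_rate': 0.0}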