import datasets
import evaluate
import numpy as np

_CITATION = """\
@inproceedings{palotti2019,
 author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
 title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
 series = {SIGIR'19},
 year = {2019},
 location = {Paris, France},
 publisher = {ACM}
}
"""

_DESCRIPTION = """\
A metric to evaluate ranking tasks using the TREC evaluation tool. It compares predicted rankings of items (e.g., documents) with their true relevance scores. The metric takes two inputs: references (true relevance scores) and predictions (predicted scores), both as lists of lists, where element (i, j) is the true or predicted score of document j in query i. In a nutshell, it simplifies the use of TREC to compute ranking metrics from per-sample scores.
"""

_KWARGS_DESCRIPTION = """Computes MAP, P@K, RR, and NDCG using the TREC evaluation tool.

Args:
    references (list[list[float]]): true relevance scores for each query
    predictions (list[list[float]]): predicted scores for each query

Returns:
    dict: the TREC evaluation scores

Example:
    # (i, j) is the true/predicted score of document j in query i
    references = [[5, 0, 3, 0, 0, 2, 1],
                  [5, 0, 3, 0, 0, 2, 1],
                  [5, 0, 3, 0, 0, 2, 1],
                  [0, 1, 2]]

    predictions = [[3, 4, 2, 0, 1, 5, 0],
                   [2, 0, 4, 5, 0, 1, 3],
                   [0, 3, 2, 1, 5, 0, 4],
                   [5, 3, 2]]

    metric = evaluate.load("symanto/ranking_evaluator")
    metric.compute(references=references, predictions=predictions)
"""


class RankingEvaluator(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("float32")),
                    "references": datasets.Sequence(datasets.Value("float32")),
                }
            ),
        )

    def _download_and_prepare(self, dl_manager):
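        # Load the underlying trec_eval metric, which performs the actual scoring.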
        self.trec_eval = evaluate.load("trec_eval")

    def _compute(
        self, references: list[list[float]], predictions: list[list[float]]
    ) -> dict:
        """
        Calculates MAP, P@K, RR, and NDCG using the TREC evaluation tool.

        Args:
            references (list[list[float]]): true relevance scores for each query
            predictions (list[list[float]]): predicted scores for each query

        Returns:
            dict: the TREC evaluation scores

        Example:
            # (i, j) is the true/predicted score of document j in query i
            references = [[5, 0, 3, 0, 0, 2, 1],
                          [5, 0, 3, 0, 0, 2, 1],
                          [5, 0, 3, 0, 0, 2, 1],
                          [0, 1, 2]]

            predictions = [[3, 4, 2, 0, 1, 5, 0],
                           [2, 0, 4, 5, 0, 1, 3],
                           [0, 3, 2, 1, 5, 0, 4],
                           [5, 3, 2]]

            metric = evaluate.load("symanto/ranking_evaluator")
            metric.compute(references=references, predictions=predictions)
        """
        qrel = {}
        run = {}

        # Fill qrel: one row per (query, document) pair with a positive true relevance.
        # Documents absent from the qrels are treated as non-relevant by trec_eval.
        for query_idx, truth in enumerate(references):
            for item_idx, relevance in enumerate(truth):
                if relevance > 0:
                    qrel.setdefault("query", []).append(query_idx)
                    qrel.setdefault("q0", []).append("q0")
                    qrel.setdefault("docid", []).append(f"doc_{item_idx}")
                    qrel.setdefault("rel", []).append(relevance)

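        # qrel now has the columnar layout consumed by the trec_eval metric, e.g. for
        # the docstring example: {"query": [0, 0, ...], "q0": ["q0", ...], "docid": ["doc_0", "doc_2", ...], "rel": [5, 3, ...]}
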
        # Fill run: one row per (query, document) pair with a positive predicted score.
        for query_idx, pred in enumerate(predictions):
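            # Double argsort converts scores into 0-based ranks (rank 0 = highest predicted score).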
            ranking = np.argsort(np.argsort(pred)[::-1])
            for item_idx, score in enumerate(pred):
                if score > 0:
                    run.setdefault("query", []).append(query_idx)
                    run.setdefault("q0", []).append("q0")
                    run.setdefault("docid", []).append(f"doc_{item_idx}")
                    run.setdefault("score", []).append(score)
                    run.setdefault("system", []).append("sys")
                    run.setdefault("rank", []).append(ranking[item_idx])

        return self.trec_eval.compute(references=[qrel], predictions=[run])
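

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). It assumes the metric is published
# on the Hub as "symanto/ranking_evaluator", as in the docstrings above, so
# loading it requires network access. The data mirrors the docstring example.
if __name__ == "__main__":
    references = [
        [5, 0, 3, 0, 0, 2, 1],
        [5, 0, 3, 0, 0, 2, 1],
        [5, 0, 3, 0, 0, 2, 1],
        [0, 1, 2],
    ]
    predictions = [
        [3, 4, 2, 0, 1, 5, 0],
        [2, 0, 4, 5, 0, 1, 3],
        [0, 3, 2, 1, 5, 0, 4],
        [5, 3, 2],
    ]
    metric = evaluate.load("symanto/ranking_evaluator")
    print(metric.compute(references=references, predictions=predictions))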