import datasets
import evaluate
import numpy as np

_CITATION = """\
@inproceedings{palotti2019,
 author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
 title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
 series = {SIGIR'19},
 year = {2019},
 location = {Paris, France},
 publisher = {ACM}
}
"""

_DESCRIPTION = """\
A metric to evaluate ranking tasks using the TREC evaluation tool. It compares predicted rankings of items (e.g., documents) with their true relevance scores. The metric takes two inputs: references (true relevance scores) and predictions (predicted scores), both as lists of lists, where element (i, j) is the true or predicted score of document j in query i. In a nutshell, it simplifies the use of TREC to compute ranking metrics from per-sample scores.
"""

_KWARGS_DESCRIPTION = """Computes MAP, P@K, RR, and NDCG using the TREC evaluation tool.

Args:
    references (list[list[float]]): true relevance scores for each query
    predictions (list[list[float]]): predicted scores for each query

Returns:
    dict: the TREC evaluation scores

Example:
    # (i, j) is the true/predicted score of document j in query i
    references = [[5, 0, 3, 0, 0, 2, 1],
                  [5, 0, 3, 0, 0, 2, 1],
                  [5, 0, 3, 0, 0, 2, 1],
                  [0, 1, 2]]

    predictions = [[3, 4, 2, 0, 1, 5, 0],
                   [2, 0, 4, 5, 0, 1, 3],
                   [0, 3, 2, 1, 5, 0, 4],
                   [5, 3, 2]]

    metric = evaluate.load("symanto/ranking_evaluator")
    metric.compute(references=references, predictions=predictions)
"""


class RankingEvaluator(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("float32")),
                    "references": datasets.Sequence(datasets.Value("float32")),
                }
            ),
        )

    def _download_and_prepare(self, dl_manager):
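        # Load the underlying trec_eval metric, which performs the actual scoring.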
        self.trec_eval = evaluate.load("trec_eval")

    def _compute(
        self, references: list[list[float]], predictions: list[list[float]]
    ) -> dict:
        """
        Calculates MAP, P@K, RR, and NDCG using the TREC evaluation tool.

        Args:
            references (list[list[float]]): true relevance scores for each query
            predictions (list[list[float]]): predicted scores for each query

        Returns:
            dict: the TREC evaluation scores

        Example:
            # (i, j) is the true/predicted score of document j in query i
            references = [[5, 0, 3, 0, 0, 2, 1],
                          [5, 0, 3, 0, 0, 2, 1],
                          [5, 0, 3, 0, 0, 2, 1],
                          [0, 1, 2]]

            predictions = [[3, 4, 2, 0, 1, 5, 0],
                           [2, 0, 4, 5, 0, 1, 3],
                           [0, 3, 2, 1, 5, 0, 4],
                           [5, 3, 2]]

            metric = evaluate.load("symanto/ranking_evaluator")
            metric.compute(references=references, predictions=predictions)
        """
        qrel = {}
        run = {}

        # Fill qrel: one row per (query, document) pair with a positive true relevance.
        # Documents absent from the qrels are treated as non-relevant by trec_eval.
        for query_idx, truth in enumerate(references):
            for item_idx, relevance in enumerate(truth):
                if relevance > 0:
                    qrel.setdefault("query", []).append(query_idx)
                    qrel.setdefault("q0", []).append("q0")
                    qrel.setdefault("docid", []).append(f"doc_{item_idx}")
                    qrel.setdefault("rel", []).append(relevance)

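        # qrel now has the columnar layout consumed by the trec_eval metric, e.g. for
        # the docstring example: {"query": [0, 0, ...], "q0": ["q0", ...], "docid": ["doc_0", "doc_2", ...], "rel": [5, 3, ...]}
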
        # Fill run: one row per (query, document) pair with a positive predicted score.
        for query_idx, pred in enumerate(predictions):
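            # Double argsort converts scores into 0-based ranks (rank 0 = highest predicted score).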
            ranking = np.argsort(np.argsort(pred)[::-1])
            for item_idx, score in enumerate(pred):
                if score > 0:
                    run.setdefault("query", []).append(query_idx)
                    run.setdefault("q0", []).append("q0")
                    run.setdefault("docid", []).append(f"doc_{item_idx}")
                    run.setdefault("score", []).append(score)
                    run.setdefault("system", []).append("sys")
                    run.setdefault("rank", []).append(ranking[item_idx])

        return self.trec_eval.compute(references=[qrel], predictions=[run])
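

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). It assumes the metric is published
# on the Hub as "symanto/ranking_evaluator", as in the docstrings above, so
# loading it requires network access. The data mirrors the docstring example.
if __name__ == "__main__":
    references = [
        [5, 0, 3, 0, 0, 2, 1],
        [5, 0, 3, 0, 0, 2, 1],
        [5, 0, 3, 0, 0, 2, 1],
        [0, 1, 2],
    ]
    predictions = [
        [3, 4, 2, 0, 1, 5, 0],
        [2, 0, 4, 5, 0, 1, 3],
        [0, 3, 2, 1, 5, 0, 4],
        [5, 3, 2],
    ]
    metric = evaluate.load("symanto/ranking_evaluator")
    print(metric.compute(references=references, predictions=predictions))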