# Page header captured with this file: Spaces: SUSTech / tlem (Running); File size: 3,873 Bytes

# %%

try:
    from ipytorch import logging
except Exception as e:
    import logging

from typing import Any, Optional, Protocol, Iterable, Callable
from tqdm.auto import tqdm
from evaluate.evaluation_suite import EvaluationSuite
import evaluate
import numpy as np
import datasets
from .tasks import Task, Metrics
from .utils import is_equiv

# %%

# %cd ../tlem

# %load_ext ipytorch
# %ls


# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

# Short description shown for the module; mirrors what ReasoningMetric._compute does.
_DESCRIPTION = """\
Reasoning accuracy metric: scores model responses against references with the
scorer selected by the configuration name and reports the mean score.
"""


# Argument/return documentation for ReasoningMetric.compute.
_KWARGS_DESCRIPTION = """
Scores responses against references and averages the scores.
Args:
    responses: list of model responses (strings).
    references: list of reference answers (strings; floats for the "svamp" configuration).
    verbose: if True, also return the references and responses that were scored.
Returns:
    accuracy: mean of the scores,
    scores: the scores returned by the scorer for the selected configuration,
    references: the input references (only when verbose=True),
    answers: the input responses (only when verbose=True),
Examples:
    >>> metric = evaluate.load("sustech/tlem", "gsm8k")
    >>> results = metric.compute(responses=responses, references=references)
"""

# TODO: Define external resources urls if needed
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ReasoningMetric(evaluate.Metric):
    """Accuracy metric whose scoring function is chosen by ``config_name``.

    The scorer is looked up by name on ``Metrics`` and applied to the
    responses and references; the mean of its scores is reported as accuracy.
    """

    def _info(self):
        """Build the module info, including the expected input features.

        Responses are always declared as strings. References are declared as
        floats for the ``"svamp"`` configuration and as strings otherwise.

        Returns:
            An ``evaluate.EvaluationModuleInfo`` describing this module.
        """
        # Only the reference dtype depends on the configuration.
        reference_dtype = "float" if self.config_name == "svamp" else "string"
        input_features = datasets.Features(
            {
                "responses": datasets.Value("string"),
                "references": datasets.Value(reference_dtype),
            }
        )

        # NOTE(review): homepage / codebase / reference URLs below look like
        # template placeholders -- TODO replace with the real locations.
        return evaluate.EvaluationModuleInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=input_features,
            homepage="http://module.homepage",
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _compute(self, responses, references, verbose=False):
        """Score responses against references and summarise the result.

        Args:
            responses: model responses, passed first to the scorer.
            references: reference answers, passed second to the scorer.
            verbose: when true, echo the inputs back in the result.

        Returns:
            A dict with ``"accuracy"`` (mean of the scores) and ``"scores"``
            (the scorer's output as returned). With ``verbose`` it also holds
            ``"references"`` and ``"answers"`` (the responses).
        """
        # The scorer is the attribute of ``Metrics`` named after the config.
        scorer = getattr(Metrics, self.config_name)
        scores = scorer(responses, references)

        results = {
            "accuracy": np.asarray(scores).mean(),
            "scores": scores,
        }
        if not verbose:
            return results

        results["references"] = references
        results["answers"] = responses
        return results


class Suite(EvaluationSuite):
    """Evaluation suite that runs each registered task against a model."""

    def __init__(self, name):
        """Initialise the base suite and register the tasks to run.

        Args:
            name: forwarded unchanged to ``EvaluationSuite.__init__``.
        """
        super().__init__(name)

        gsm8k_task = Task(
            dataset_name=("gsm8k", "main"),
            metric_name=("sustech/tlem", "gsm8k"),
            input_column="question",
            label_column="answer",
        )
        self.suite = [gsm8k_task]

    def run(
        self, model_or_pipeline: Any, prompt: str = "{instruction}"
    ) -> dict[str, float]:
        """Run every task in the suite and collect the results by task name.

        Args:
            model_or_pipeline: handed to each task's ``run`` method as is.
            prompt: NOTE(review): accepted but not used anywhere in this
                method -- confirm whether it should be forwarded to the tasks.

        Returns:
            A dict mapping each task's ``name`` to whatever ``task.run``
            returned for it.
        """
        self.assert_suite_nonempty()

        # The key (task.name) is evaluated before the value (task.run(...)),
        # matching the statement order of an explicit loop.
        return {
            task.name: task.run(model_or_pipeline)
            for task in tqdm(self.suite, desc="Running tasks")
        }


# %%