# %%
try:
    from ipytorch import logging
except Exception as e:
    import logging

from typing import Any, Optional, Protocol, Iterable, Callable

# %%
# %cd ../tlem
# %load_ext ipytorch
# %ls

from utils import (
    NUMERIC_IN_ZH,
    extract_choice_ans,
    extract_numeric,
    get_answer,
    is_equiv,
)

from dataclasses import dataclass, field
from datasets import load_dataset, Dataset
from functools import cached_property

# A text-generation pipeline maps a batch of prompts to a batch of completions.
TextGenerationPipeline = Callable[[Iterable[str]], list[str]]

from evaluate import EvaluationModule, Evaluator, evaluator, load


@dataclass
class Task:
    """Bundles a dataset, an optional prompt, and a metric into a runnable evaluation."""

    dataset_name: str = "gsm8k"
    dataset_params: dict = field(default_factory=dict)
    # metrics: list[str] = field(default_factory=list)
    metric_name: str | tuple[str, str] = "gsm8k"
    input_column: str = "question"
    label_column: str = "answer"
    prompt: Optional[Callable | str] = None

    @cached_property
    def samples(self):
        return self.dataset[self.input_column]

    @cached_property
    def dataset(self):
        ds = load_dataset(self.dataset_name, **self.dataset_params)
        if self.prompt is not None:
            # String prompts are treated as templates with an `{input_column}`
            # placeholder; callables receive the raw example and return the update.
            ds = ds.map(
                lambda example: {
                    self.input_column: self.prompt.format(
                        input_column=example[self.input_column]
                    )
                }
                if isinstance(self.prompt, str)
                else self.prompt(example),
            )
        return ds

    @cached_property
    def metric(self):
        metric = (
            load(self.metric_name)
            if isinstance(self.metric_name, str)
            else load(*self.metric_name)
        )
        return metric

    def run(self, pipeline: TextGenerationPipeline):
        outputs = pipeline(self.samples)
        # `compute` only accepts keyword arguments; the names match the
        # features declared by `ReasoningMetric` below.
        return self.metric.compute(
            responses=outputs, references=self.dataset[self.label_column]
        )


class Metrics:
    """Per-benchmark scorers: each takes model responses and gold answers and
    returns a list of per-example scores (1.0 for a match, 0.0 otherwise)."""

    @staticmethod
    def gsm8k(responses: list[str], answers: list[str | int]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response)
            gold = extract_numeric(answer) if isinstance(answer, str) else str(answer)
            scores.append(1.0 * (pred == gold))
        return scores

    @staticmethod
    def MATH(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            # Take the contents of the last `$...$` span as the predicted answer.
            indices = [pos for pos, char in enumerate(response) if char == "$"]
            if len(indices) <= 2:
                scores.append(0.0)
                continue
            result = response[indices[-2] + 1 : indices[-1]]
            gold = get_answer(answer)
            scores.append(1.0 * is_equiv(result, gold))
        return scores

    @staticmethod
    def math23k(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = extract_numeric(answer, pattern=NUMERIC_IN_ZH)
            scores.append(1.0 * (pred == gold))
        return scores

    @staticmethod
    def gsm8k_zh(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = extract_numeric(answer)
            scores.append(1.0 * (pred == gold))
        return scores

    @staticmethod
    def svamp(responses: list[str], answers: list[float]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = answer
            scores.append(1.0 * (float(pred) == gold))
        return scores

    @staticmethod
    def mmlu(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_choice_ans(response)
            gold = answer.lower()
            scores.append(1.0 * (pred == gold))
        return scores


import evaluate
import numpy as np
import datasets

# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

_DESCRIPTION = """\
Answer-extraction accuracy for reasoning benchmarks (GSM8K, MATH, Math23K,
GSM8K-zh, SVAMP, and MMLU). Each model response is parsed for a final answer,
which is then compared against the reference.
""" # TODO: Add description of the arguments of the module here _KWARGS_DESCRIPTION = """ Calculates number of elements in dataset Args: data: list of elements. Returns: element_count: number of elements in dataset, Examples: >>> measure = evaluate.load("lvwerra/element_count") >>> measure.compute(["a", "b", "c") {"element_count": 3} """ # TODO: Define external resources urls if needed BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt" @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) class ReasoningMetric(evaluate.Metric): """TODO: Short description of my evaluation module.""" def _info(self): features = datasets.Features( { "responses": datasets.Value("string"), "references": datasets.Value("string"), } ) if self.config_name == "svamp": features = datasets.Features( { "responses": datasets.Value("string"), "references": datasets.Value("float"), } ) # TODO: Specifies the evaluate.EvaluationModuleInfo object return evaluate.EvaluationModuleInfo( # This is the description that will appear on the modules page. # module_type="measurement", description=_DESCRIPTION, citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, # This defines the format of each prediction and reference features=features, # Homepage of the module for documentation homepage="http://module.homepage", # Additional links to the codebase or references codebase_urls=["http://github.com/path/to/codebase/of/new_module"], reference_urls=["http://path.to.reference.url/new_module"], ) def _compute(self, responses, references, verbose=False): results = {} scores = getattr(Metrics, self.config_name)(responses, references) acc = np.asarray(scores).mean() results = { "accuracy": acc, "scores": scores, } if verbose: results["references"] = references results["answers"] = responses # results["scores"] = scores return results