# %%
try:
    from ipytorch import logging
except Exception as e:
    import logging

from typing import Any, Optional, Protocol, Iterable, Callable

# %%
# %cd ../tlem
# %load_ext ipytorch
# %ls

from utils import (
    NUMERIC_IN_ZH,
    extract_choice_ans,
    extract_numeric,
    get_answer,
    is_equiv,
)

from dataclasses import dataclass, field
from datasets import load_dataset, Dataset
from functools import cached_property

# A text-generation "pipeline" here is anything that maps a batch of prompts
# to a list of generated strings.
TextGenerationPipeline = Callable[[Iterable[str]], list[str]]

from evaluate import EvaluationModule, Evaluator, evaluator, load

@dataclass
class Task:
    dataset_name: str = "gsm8k"
    dataset_params: dict = field(default_factory=dict)
    # metrics: list[str] = field(default_factory=list)
    metric_name: str | tuple[str, str] = "gsm8k"
    input_column: str = "question"
    label_column: str = "answer"  # default matches the gsm8k answer column
    prompt: Optional[Callable | str] = None

    @cached_property
    def samples(self):
        return self.dataset[self.input_column]

    @cached_property
    def dataset(self):
        ds = load_dataset(self.dataset_name, **self.dataset_params)
        if self.prompt is not None:
            # A string prompt is treated as a format template; a callable is
            # applied to the raw example and must return the mapped columns.
            ds = ds.map(
                lambda example: {
                    self.input_column: self.prompt.format(
                        input_column=example[self.input_column]
                    )
                }
                if isinstance(self.prompt, str)
                else self.prompt(example),
            )
        return ds

    @cached_property
    def metric(self):
        metric = (
            load(self.metric_name)
            if isinstance(self.metric_name, str)
            else load(*self.metric_name)
        )
        return metric

    def run(self, pipeline: TextGenerationPipeline):
        outputs = pipeline(self.samples)
        # evaluate modules take keyword arguments named after their features.
        return self.metric.compute(
            responses=outputs, references=self.dataset[self.label_column]
        )
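
# %%
# Minimal usage sketch (not part of the original notebook): a stand-in "pipeline"
# that satisfies the TextGenerationPipeline alias, wired into Task to show the
# expected call flow. It assumes the default gsm8k dataset/metric names resolve
# (e.g. this file's ReasoningMetric is loadable as "gsm8k"); a real run would
# pass a model-backed callable instead.
if __name__ == "__main__":

    def echo_pipeline(prompts: Iterable[str]) -> list[str]:
        # Pretend every question is answered with 42, in gsm8k's "#### <n>" style.
        return ["The answer is 42.\n#### 42" for _ in prompts]

    demo_task = Task(dataset_params={"name": "main", "split": "test"})
    print(demo_task.run(echo_pipeline))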

class Metrics:
    """Batch scorers, one per task; each returns a list of 0/1 floats."""

    @staticmethod
    def gsm8k(responses: list[str], answers: list[str | int]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response)
            gold = extract_numeric(answer) if isinstance(answer, str) else str(answer)
            scores.append(1.0 * (pred == gold))
        return scores

    @staticmethod
    def MATH(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            # Grade the expression inside the last $...$ span of the response.
            indices = [pos for pos, char in enumerate(response) if char == "$"]
            if len(indices) <= 2:
                scores.append(0.0)
                continue
            result = response[indices[-2] + 1 : indices[-1]]
            gold = get_answer(answer)
            scores.append(1.0 * is_equiv(result, gold))
        return scores

    @staticmethod
    def math23k(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = extract_numeric(answer, pattern=NUMERIC_IN_ZH)
            scores.append(1.0 * (pred == gold))
        return scores

    @staticmethod
    def gsm8k_zh(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = extract_numeric(answer)
            scores.append(1.0 * (pred == gold))
        return scores

    @staticmethod
    def svamp(responses: list[str], answers: list[float]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = answer
            scores.append(1.0 * (float(pred) == gold))
        return scores

    @staticmethod
    def mmlu(responses, answers):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_choice_ans(response)
            gold = answer.lower()
            scores.append(1.0 * (pred == gold))
        return scores
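
# %%
# Quick sanity check (illustrative, not from the original source): the scorers in
# Metrics are plain batch functions and can be exercised directly. The exact
# strings utils.extract_numeric accepts are an assumption here; the "#### 18"
# form mirrors gsm8k-style reference answers.
if __name__ == "__main__":
    demo_scores = Metrics.gsm8k(
        responses=["9 + 9 = 18, so she makes 18 dollars. #### 18"],
        answers=["She makes 9 * 2 = 18 dollars per day. #### 18"],
    )
    print(demo_scores)  # expected [1.0] if both sides extract "18"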

import evaluate
import numpy as np
import datasets

# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

_DESCRIPTION = """\
Accuracy metrics for reasoning benchmarks (gsm8k, MATH, math23k, gsm8k_zh, svamp, mmlu):
model generations are scored against gold answers by the task-specific scorers in `Metrics`.
"""

_KWARGS_DESCRIPTION = """
Computes accuracy for a reasoning benchmark chosen via the metric's config name.
Args:
    responses: list of model generations, one per example.
    references: list of gold answers (strings, or floats for the svamp config).
Returns:
    accuracy: mean of the per-example scores.
    scores: list of per-example 0/1 scores.
Examples:
    >>> metric = ReasoningMetric(config_name="gsm8k")
    >>> metric.compute(responses=["6 + 12 = 18 #### 18"], references=["#### 18"])
    {'accuracy': 1.0, 'scores': [1.0]}
"""

# TODO: Define external resources urls if needed
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"

class ReasoningMetric(evaluate.Metric):
    """Accuracy metric that dispatches to the Metrics scorer named by config_name."""

    def _info(self):
        # svamp references are numeric; every other task compares strings.
        features = datasets.Features(
            {
                "responses": datasets.Value("string"),
                "references": datasets.Value(
                    "float" if self.config_name == "svamp" else "string"
                ),
            }
        )
        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.EvaluationModuleInfo(
            # This is the description that will appear on the modules page.
            # module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=features,
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _compute(self, responses, references, verbose=False):
        # Dispatch to the scorer whose name matches the config (e.g. "gsm8k").
        scores = getattr(Metrics, self.config_name)(responses, references)
        acc = np.asarray(scores).mean()
        results = {
            "accuracy": acc,
            "scores": scores,
        }
        if verbose:
            results["references"] = references
            results["answers"] = responses
            # results["scores"] = scores
        return results
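
# %%
# Illustrative direct use of ReasoningMetric (not in the original file). In a
# deployed setup the module would normally be resolved via evaluate.load on its
# script path; instantiating it with a config_name exercises the same _compute
# dispatch. The result shape is {"accuracy": ..., "scores": [...]}; the concrete
# values depend on utils.extract_numeric.
if __name__ == "__main__":
    demo_metric = ReasoningMetric(config_name="gsm8k")
    demo_result = demo_metric.compute(
        responses=["7 + 5 = 12, so the answer is 12. #### 12"],
        references=["7 + 5 = 12 #### 12"],
    )
    print(demo_result)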