# tlem/tlem.py
# %%
try:
    # Prefer ipytorch's logging when available, otherwise fall back to stdlib logging.
    from ipytorch import logging
except ImportError:
    import logging
from typing import Any, Optional, Protocol, Iterable, Callable
# %%
# %cd ../tlem
# %load_ext ipytorch
# %ls
from utils import (
NUMERIC_IN_ZH,
extract_choice_ans,
extract_numeric,
get_answer,
is_equiv,
)
from dataclasses import dataclass, field
from datasets import load_dataset, Dataset
from functools import cached_property
TextGenerationPipeline = Callable[[Iterable[str]], list[str]]
from evaluate import EvaluationModule, Evaluator, evaluator, load
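

# Not part of the original module: a hedged sketch of how a Hugging Face
# `transformers` text-generation pipeline could be adapted to the simple
# "list of prompts -> list of strings" callable that Task.run expects.
# `hf_pipeline_adapter` is a hypothetical helper; the `return_full_text` flag and
# the [{"generated_text": ...}] output layout follow transformers conventions and
# may need adjusting for other pipelines.
def hf_pipeline_adapter(pipe) -> TextGenerationPipeline:
    def generate(prompts: Iterable[str]) -> list[str]:
        # The pipeline returns one list of candidates per prompt;
        # keep the first candidate's text for each prompt.
        outputs = pipe(list(prompts), return_full_text=False)
        return [candidates[0]["generated_text"] for candidates in outputs]

    return generate
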
@dataclass
class Task:
dataset_name: str = "gsm8k"
dataset_params: dict = field(default_factory=dict)
# metrics: list[str] = field(default_factory=list)
metric_name: str | tuple[str, str] = "gsm8k"
input_column: str = "question"
    label_column: str = "answer"  # must have a default (it follows defaulted fields); "answer" matches gsm8k
prompt: Optional[Callable | str] = None
@cached_property
def samples(self):
return self.dataset[self.input_column]
@cached_property
def dataset(self):
ds = load_dataset(self.dataset_name, **self.dataset_params)
        # A string prompt is used as a template with an `{input_column}` placeholder;
        # a callable prompt receives the whole example dict.
        if self.prompt is not None:
ds = ds.map(
lambda example: {
self.input_column: self.prompt.format(
input_column=example[self.input_column]
)
}
if isinstance(self.prompt, str)
else self.prompt(example),
)
return ds
@cached_property
def metric(self):
metric = (
load(self.metric_name)
if isinstance(self.metric_name, str)
else load(*self.metric_name)
)
return metric
def run(self, pipeline: TextGenerationPipeline):
outputs = pipeline(self.samples)
        # EvaluationModule.compute only accepts keyword arguments, and the feature
        # names declared by ReasoningMetric are "responses" and "references".
        return self.metric.compute(
            responses=outputs, references=self.dataset[self.label_column]
        )
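

# %%
# A minimal smoke-test sketch (assumptions: the gsm8k "main" config is the target
# dataset, and evaluate can resolve this file as a local metric script named
# "tlem.py" in the working directory). Guarded so nothing runs when the module is
# merely imported by evaluate.
if __name__ == "__main__":

    def echo_pipeline(prompts: Iterable[str]) -> list[str]:
        # Stand-in for a real text-generation pipeline; always "answers" 0.
        return ["I am not sure. #### 0" for _ in prompts]

    demo_task = Task(
        dataset_name="gsm8k",
        dataset_params={"name": "main", "split": "test[:8]"},
        metric_name=("tlem.py", "gsm8k"),
    )
    print(demo_task.run(echo_pipeline))
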
class Metrics:
    """Per-benchmark scorers; each returns a list of 1.0/0.0 scores."""
def gsm8k(responses: list[str], answers: list[str | int]):
scores = []
for response, answer in zip(responses, answers):
pred = extract_numeric(response)
gold = extract_numeric(answer) if isinstance(answer, str) else str(answer)
scores.append(1.0 * (pred == gold))
return scores
def MATH(responses: list[str], answers: list[str]):
scores = []
for response, answer in zip(responses, answers):
indices = [pos for pos, char in enumerate(response) if char == "$"]
if len(indices) <= 2:
scores.append(0)
continue
else:
result = response[indices[-2] + 1 : indices[-1]]
gold = get_answer(answer)
scores.append(1.0 * is_equiv(result, gold))
return scores
def math23k(responses: list[str], answers: list[str]):
scores = []
for response, answer in zip(responses, answers):
pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
gold = extract_numeric(answer, pattern=NUMERIC_IN_ZH)
scores.append(1.0 * (pred == gold))
return scores
def gsm8k_zh(responses: list[str], answers: list[str]):
scores = []
for response, answer in zip(responses, answers):
pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
gold = extract_numeric(answer)
scores.append(1.0 * (pred == gold))
return scores
    def svamp(responses: list[str], answers: list[float]):
scores = []
for response, answer in zip(responses, answers):
pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
gold = answer
scores.append(1.0 * (float(pred) == gold))
return scores
def mmlu(responses, answers):
scores = []
for response, answer in zip(responses, answers):
pred = extract_choice_ans(response)
gold = answer.lower()
scores.append(1.0 * (pred == gold))
return scores
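

# %%
# Quick sanity-check sketch for the scorers above. The exact matching behaviour is
# decided by utils.extract_numeric, so the strings here are only illustrative.
if __name__ == "__main__":
    demo_scores = Metrics.gsm8k(
        responses=["She sold 48 + 24 = 72 clips. The answer is 72. #### 72"],
        answers=["48 + 24 = 72 clips in total. #### 72"],
    )
    print(demo_scores)  # 1.0 where the extracted numbers match, else 0.0
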
import evaluate
import numpy as np
import datasets
# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""
_DESCRIPTION = """\
Accuracy-style metrics for reasoning benchmarks (gsm8k, MATH, math23k, gsm8k_zh,
svamp, mmlu): each response is scored 1.0 if the answer extracted from it matches
the reference, otherwise 0.0.
"""
_KWARGS_DESCRIPTION = """
Computes per-sample scores and overall accuracy.
Args:
    responses: list of model outputs (strings).
    references: list of reference answers (strings; floats for the svamp config).
Returns:
    accuracy: mean of the per-sample scores.
    scores: list of per-sample scores (1.0 or 0.0).
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ReasoningMetric(evaluate.Metric):
"""TODO: Short description of my evaluation module."""
def _info(self):
features = datasets.Features(
{
"responses": datasets.Value("string"),
"references": datasets.Value("string"),
}
)
if self.config_name == "svamp":
features = datasets.Features(
{
"responses": datasets.Value("string"),
"references": datasets.Value("float"),
}
)
# TODO: Specifies the evaluate.EvaluationModuleInfo object
return evaluate.EvaluationModuleInfo(
# This is the description that will appear on the modules page.
# module_type="measurement",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
# This defines the format of each prediction and reference
features=features,
# Homepage of the module for documentation
homepage="http://module.homepage",
# Additional links to the codebase or references
codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
reference_urls=["http://path.to.reference.url/new_module"],
)
    def _compute(self, responses, references, verbose=False):
        # Dispatch to the scorer in Metrics that matches this config name
        # (e.g. "gsm8k", "MATH", "svamp", "mmlu").
        scores = getattr(Metrics, self.config_name)(responses, references)
        acc = np.asarray(scores).mean()
        results = {
            "accuracy": acc,
            "scores": scores,
        }
        if verbose:
            results["references"] = references
            results["answers"] = responses
        return results
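

# %%
# A minimal usage sketch: instantiating the module directly with a config name and
# scoring a toy batch (the same path `evaluate.load` takes once the script is
# resolved). Guarded so it never runs on import.
if __name__ == "__main__":
    gsm8k_metric = ReasoningMetric(config_name="gsm8k")
    print(
        gsm8k_metric.compute(
            responses=["6 * 7 = 42, so the answer is 42. #### 42"],
            references=["#### 42"],
        )
    )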