Spaces:

argilla
/

synthetic-data-generator

Running

File size: 7,153 Bytes

34371d3

from typing import List

from datasets import get_dataset_config_names, get_dataset_split_names
from distilabel.llms import InferenceEndpointsLLM
from distilabel.steps.tasks import (
    UltraFeedback,
    TextGeneration,
)

from src.distilabel_dataset_generator.pipelines.base import (
    MODEL,
    _get_next_api_key,
)
from src.distilabel_dataset_generator.utils import extract_column_names


def get_ultrafeedback_evaluator(aspect, is_sample):
    """Build an ``UltraFeedback`` task for ``aspect`` and call ``load()`` on it.

    Args:
        aspect: Forwarded unchanged as the task's ``aspect`` argument.
        is_sample: When truthy, ``max_new_tokens`` is capped at 256;
            otherwise 2048 is used.

    Returns:
        The ``UltraFeedback`` instance after ``load()`` has been called.
    """
    token_budget = 256 if is_sample else 2048
    llm = InferenceEndpointsLLM(
        model_id=MODEL,
        tokenizer_id=MODEL,
        api_key=_get_next_api_key(),
        generation_kwargs={"temperature": 0.7, "max_new_tokens": token_budget},
    )
    evaluator = UltraFeedback(llm=llm, aspect=aspect)
    evaluator.load()
    return evaluator


def get_custom_evaluator(prompt_template, structured_output, columns, is_sample):
    """Build a ``TextGeneration`` task with JSON structured output and load it.

    Args:
        prompt_template: Passed to the task as ``template``.
        structured_output: Used as the ``schema`` entry of the LLM's
            ``structured_output`` setting (with ``format`` fixed to ``"json"``).
        columns: Passed to the task as ``columns``.
        is_sample: When truthy, ``max_new_tokens`` is capped at 256;
            otherwise 2048 is used.

    Returns:
        The ``TextGeneration`` instance after ``load()`` has been called.
    """
    token_budget = 256 if is_sample else 2048
    llm = InferenceEndpointsLLM(
        model_id=MODEL,
        tokenizer_id=MODEL,
        api_key=_get_next_api_key(),
        structured_output={"format": "json", "schema": structured_output},
        generation_kwargs={"temperature": 0.7, "max_new_tokens": token_budget},
    )
    evaluator = TextGeneration(
        llm=llm,
        template=prompt_template,
        columns=columns,
    )
    evaluator.load()
    return evaluator


def generate_ultrafeedback_pipeline_code(
    repo_id, subset, split, aspects, instruction_column, response_columns, num_rows
) -> str:
    """Render a standalone distilabel UltraFeedback script as source text.

    The returned snippet is meant to be shown to / copied by a user. It calls
    a ``preprocess_data`` helper that the snippet itself does not define; the
    user has to supply it.

    Args:
        repo_id: Dataset repository id interpolated into ``load_dataset``.
        subset: Dataset configuration name interpolated into ``load_dataset``.
        split: Split name; the snippet loads ``split[:num_rows]``.
        aspects: Indexable collection of aspect names. Exactly one aspect
            yields a single ``UltraFeedback`` task; any other length yields
            one task per aspect merged through ``CombineOutputs``.
        instruction_column: Column name passed to ``preprocess_data``.
        response_columns: Passed to ``preprocess_data`` as a Python literal
            (its ``repr``), so a list stays a list in the generated code.
        num_rows: Row limit applied through the split slice.

    Returns:
        The generated Python source as a string (starting with a newline).
    """
    if len(aspects) == 1:
        # The aspect is inlined as a literal: the generated script has no
        # `aspect` variable of its own in the single-task layout.
        code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
import os
from datasets import load_dataset
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks import UltraFeedback
from distilabel.llms import InferenceEndpointsLLM

MODEL = "{MODEL}"
os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained

hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}[:{num_rows}]")
data = preprocess_data(hf_ds, "{instruction_column}", {response_columns!r}) # to get a list of dictionaries

with Pipeline(name="ultrafeedback") as pipeline:

    load_the_dataset = LoadDataFromDicts(
        data = data,
    )

    ultrafeedback_evaluator = UltraFeedback(
        llm=InferenceEndpointsLLM(
            model_id=MODEL,
            tokenizer_id=MODEL,
            api_key=os.environ["HF_TOKEN"],
            generation_kwargs={{
                "temperature": 0.7,
                "max_new_tokens": 2048,
            }},
        ),
        aspect={aspects[0]!r},
    )

    load_the_dataset >> ultrafeedback_evaluator

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    else:
        # The aspect list is inlined into the `for` loop so the generated
        # script is self-contained; doubled braces survive as the generated
        # script's own f-string placeholders.
        code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
import os
from datasets import load_dataset
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts, CombineOutputs
from distilabel.steps.tasks import UltraFeedback
from distilabel.llms import InferenceEndpointsLLM

MODEL = "{MODEL}"
os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained

hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}[:{num_rows}]")
data = preprocess_data(hf_ds, "{instruction_column}", {response_columns!r}) # to get a list of dictionaries

with Pipeline(name="ultrafeedback") as pipeline:

    load_the_dataset = LoadDataFromDicts(
        data = data,
    )

    tasks = []
    for aspect in {aspects!r}:
        evaluate_responses = UltraFeedback(
            name=f"evaluate-responses-{{aspect}}",
            aspect=aspect,
            llm=InferenceEndpointsLLM(
                model_id=MODEL,
                tokenizer_id=MODEL,
                api_key=os.environ["HF_TOKEN"],
                generation_kwargs={{
                    "temperature": 0.7,
                    "max_new_tokens": 2048,
                }},
            ),
            output_mappings={{
                "ratings": f"ratings_{{aspect}}",
                "types": f"type_{{aspect}}",
                "rationales": f"rationales_for_types_{{aspect}}",
                "rationales-for-ratings": f"rationales_for_ratings_{{aspect}}",
            }} if aspect in ["truthfulness", "helpfulness"] else {{"rationales": f"rationales_{{aspect}}", "ratings": f"ratings_{{aspect}}"}},
        )
        tasks.append(evaluate_responses)

    combine_outputs = CombineOutputs()

    load_the_dataset >> tasks >> combine_outputs

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code


def generate_custom_pipeline_code(
    repo_id, subset, split, prompt_template, structured_output, num_rows
) -> str:
    """Render a standalone distilabel custom-evaluation script as source text.

    Args:
        repo_id: Dataset repository id interpolated into ``LoadDataFromHub``.
        subset: Dataset configuration name (``config=``).
        split: Split name (``split=``).
        prompt_template: Template text; emitted as a Python string literal
            assigned to ``CUSTOM_TEMPLATE``.
        structured_output: Interpolated verbatim as the ``schema`` value and
            also handed to ``extract_column_names``.
        num_rows: Emitted as ``num_examples``.

    Returns:
        The generated Python source as a string (starting with a newline).
    """
    # NOTE(review): columns are derived from `structured_output`, not from
    # `prompt_template` — confirm against `extract_column_names`' contract.
    columns = extract_column_names(structured_output)
    # `!r` emits a valid Python literal even when the template spans several
    # lines or contains double quotes; wrapping it in plain "..." would
    # produce a script that does not parse.
    code = f"""
# Requirements: `pip install "distilabel[hf-inference-endpoints,instructor]"`
import os
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub
from distilabel.steps.tasks import TextGeneration
from distilabel.llms import InferenceEndpointsLLM

MODEL = "{MODEL}"
CUSTOM_TEMPLATE = {prompt_template!r}
os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained

with Pipeline(name="custom-evaluation") as pipeline:
    load_the_dataset = LoadDataFromHub(
        repo_id="{repo_id}",
        config="{subset}",
        split="{split}",
        num_examples={num_rows},
        batch_size=2
    )
    custom_evaluator = TextGeneration(
        llm=InferenceEndpointsLLM(
            model_id=MODEL,
            tokenizer_id=MODEL,
            api_key=os.environ["HF_TOKEN"],
            structured_output={{"format": "json", "schema": {structured_output}}},
            generation_kwargs={{
                "temperature": 0.7,
                "max_new_tokens": 2048,
            }},
        ),
        template=CUSTOM_TEMPLATE,
        columns={columns}
    )

    load_the_dataset >> custom_evaluator

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code


def generate_pipeline_code(repo_id, aspects, instruction_column, response_columns, prompt_template, structured_output, num_rows, eval_type):
    """Return generated pipeline source for the requested evaluation type.

    When ``repo_id`` is ``None`` the subset/split fall back to
    ``"default"``/``"train"``; otherwise the first configuration name and the
    first split name reported for the repository are used. An ``eval_type``
    of ``"ultrafeedback"`` selects the UltraFeedback generator; any other
    value selects the custom generator.
    """
    if repo_id is not None:
        # Take the first configuration, then the first split of that configuration.
        subset = get_dataset_config_names(repo_id)[0]
        split = get_dataset_split_names(repo_id, subset)[0]
    else:
        subset, split = "default", "train"

    if eval_type != "ultrafeedback":
        return generate_custom_pipeline_code(
            repo_id, subset, split, prompt_template, structured_output, num_rows
        )
    return generate_ultrafeedback_pipeline_code(
        repo_id,
        subset,
        split,
        aspects,
        instruction_column,
        response_columns,
        num_rows,
    )