""" Utility for generating mock data functions based on text descriptions. This utility uses a zero-shot classification model to match text descriptions to appropriate Faker functions for generating synthetic data. """ from transformers import pipeline from faker import Faker fake = Faker() # Create a dictionary of Faker functions with descriptive labels faker_functions = { "person name": fake.name, "first name": fake.first_name, "last name": fake.last_name, "email address": fake.email, "phone number": fake.phone_number, "street address": fake.street_address, "street": fake.street_address, "city name": fake.city, "state name": fake.state, "country name": fake.country, "zip code": fake.zipcode, "job title": fake.job, "company name": fake.company, "credit card number": fake.credit_card_number, "date of birth": fake.date_of_birth, "username": fake.user_name, "website url": fake.url, "paragraph text": fake.paragraph, "sentence text": fake.sentence, } def get_highest_score_functions(result, faker_functions, threshold=0.18): """ Process model results and map sequences to appropriate Faker functions. Args: result: The classification results from the model faker_functions: Dictionary mapping labels to Faker functions threshold: Minimum confidence score to assign a function (default: 0.18) Returns: dict: Mapping of input sequences to corresponding Faker functions """ sequence_to_function = {} for item in result: sequence = item["sequence"] label = item["labels"][0] score = item["scores"][0] if score >= threshold: sequence_to_function[sequence] = faker_functions.get(label) else: sequence_to_function[sequence] = None return sequence_to_function def get_functions_for_descriptions(descriptions): """ Get mock data functions based on descriptions or property names. Uses zero-shot classification to match text descriptions to appropriate Faker functions for generating synthetic data. Args: descriptions: Array of descriptions or property names to classify Returns: dict: Mapping of descriptions to corresponding mock data functions """ # Create pipeline with small zero-shot classification model pipe = pipeline(model="MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary") # Call pipeline with descriptions and available Faker function labels result = pipe(descriptions, candidate_labels=list(faker_functions.keys())) # Process results using helper function with default confidence threshold return get_highest_score_functions(result, faker_functions, threshold=0.18)