# edodso2's picture
# Fix incorrect comment
# 8a0b401
"""
Utility for generating mock data functions based on text descriptions.
This utility uses a zero-shot classification model to match text descriptions
to appropriate Faker functions for generating synthetic data.
"""
from transformers import pipeline
from faker import Faker
# Single shared Faker instance; every value in the table below is one of its
# bound generator methods.
fake = Faker()

# Candidate labels for classification, each paired with the Faker method that
# produces a matching value. The keys are the exact label strings handed to the
# classifier, so their wording and order are kept as-is.
faker_functions = {
    # People and contact details
    "person name": fake.name,
    "first name": fake.first_name,
    "last name": fake.last_name,
    "email address": fake.email,
    "phone number": fake.phone_number,

    # Postal address parts
    "street address": fake.street_address,
    # NOTE(review): "street" reuses street_address, the same generator as the
    # entry above -- confirm this synonym is intended.
    "street": fake.street_address,
    "city name": fake.city,
    "state name": fake.state,
    "country name": fake.country,
    "zip code": fake.zipcode,

    # Work and finance
    "job title": fake.job,
    "company name": fake.company,
    "credit card number": fake.credit_card_number,

    # Personal and online identity
    "date of birth": fake.date_of_birth,
    "username": fake.user_name,
    "website url": fake.url,

    # Free text
    "paragraph text": fake.paragraph,
    "sentence text": fake.sentence,
}
def get_highest_score_functions(result, faker_functions, threshold=0.18):
    """
    Process model results and map sequences to appropriate Faker functions.

    Only the first entry of each result's "labels" and "scores" lists is
    considered (assumes the classifier returns them sorted best-first, as the
    zero-shot classification pipeline is documented to do).

    Args:
        result: The classification results from the model. Either a list of
            result dicts or a single result dict (the pipeline may return a
            bare dict when it classified only one sequence). Each dict must
            provide "sequence", "labels" and "scores".
        faker_functions: Dictionary mapping labels to Faker functions
        threshold: Minimum confidence score to assign a function (default: 0.18)

    Returns:
        dict: Mapping of input sequences to corresponding Faker functions.
        A sequence maps to None when its top score is below the threshold,
        when it has no labels/scores at all, or when its top label is not a
        key of faker_functions.
    """
    if not result:
        return {}

    # A lone result dict would otherwise be iterated key by key and fail on
    # item["sequence"]; treat it as a one-element batch instead.
    if isinstance(result, dict):
        result = [result]

    sequence_to_function = {}
    for item in result:
        labels = item["labels"]
        scores = item["scores"]

        # Guard against empty label/score lists so they fall into the
        # "no confident match" case rather than raising IndexError.
        if labels and scores and scores[0] >= threshold:
            matched_function = faker_functions.get(labels[0])
        else:
            matched_function = None

        sequence_to_function[item["sequence"]] = matched_function

    return sequence_to_function
# Lazily created zero-shot classification pipeline, shared across calls so the
# model is loaded once per process instead of on every call.
_classifier = None


def _get_classifier():
    """Return the shared zero-shot classification pipeline, creating it on first use."""
    global _classifier
    if _classifier is None:
        # Small zero-shot classification model; the task is named explicitly
        # rather than inferred from the model's hub metadata.
        _classifier = pipeline(
            "zero-shot-classification",
            model="MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary",
        )
    return _classifier


def get_functions_for_descriptions(descriptions, threshold=0.18):
    """
    Get mock data functions based on descriptions or property names.

    Uses zero-shot classification to match text descriptions to appropriate
    Faker functions for generating synthetic data.

    Args:
        descriptions: Descriptions or property names to classify. May be a
            list (or other iterable) of strings, or a single string. None or
            an empty collection yields an empty result without loading the
            model.
        threshold: Minimum confidence score required to assign a function
            (default: 0.18)

    Returns:
        dict: Mapping of descriptions to corresponding mock data functions.
        A description maps to None when no label reaches the threshold.
    """
    if descriptions is None:
        return {}

    # Normalise the input to a plain list: a bare string becomes a
    # one-element batch, any other iterable is materialised.
    if isinstance(descriptions, str):
        descriptions = [descriptions]
    else:
        descriptions = list(descriptions)

    # Nothing to classify: skip the classifier, which rejects empty input.
    if not descriptions:
        return {}

    # Call pipeline with descriptions and available Faker function labels
    classify = _get_classifier()
    result = classify(descriptions, candidate_labels=list(faker_functions))

    # The pipeline may hand back a single dict for a single sequence;
    # normalise so the helper always receives a list of result dicts.
    if isinstance(result, dict):
        result = [result]

    # Map each description to the Faker function of its best-scoring label
    return get_highest_score_functions(result, faker_functions, threshold=threshold)