# Captured from a Hugging Face Space page (status banner at capture: "Sleeping").
"""
Utility for generating mock data functions based on text descriptions.

This utility uses a zero-shot classification model to match text descriptions
to appropriate Faker functions for generating synthetic data.
"""
from functools import lru_cache

from faker import Faker
from transformers import pipeline
# Shared Faker instance; every value in the table below is one of its bound
# generator methods, stored uncalled so callers decide when to produce data.
fake = Faker()

# Create a dictionary of Faker functions with descriptive labels.
# The keys are natural-language labels: they are passed verbatim to the
# classifier as candidate labels, so their wording affects matching.
faker_functions = {
    # People
    "person name": fake.name,
    "first name": fake.first_name,
    "last name": fake.last_name,
    # Contact details
    "email address": fake.email,
    "phone number": fake.phone_number,
    # Location
    "street address": fake.street_address,
    # Alias: bare "street" yields the same generator as "street address".
    "street": fake.street_address,
    "city name": fake.city,
    "state name": fake.state,
    "country name": fake.country,
    "zip code": fake.zipcode,
    # Work
    "job title": fake.job,
    "company name": fake.company,
    # Miscellaneous identifiers
    "credit card number": fake.credit_card_number,
    "date of birth": fake.date_of_birth,
    "username": fake.user_name,
    "website url": fake.url,
    # Free text
    "paragraph text": fake.paragraph,
    "sentence text": fake.sentence,
}
def get_highest_score_functions(result, faker_functions, threshold=0.18):
    """
    Process model results and map sequences to appropriate Faker functions.

    Args:
        result: The classification results from the model. Either a single
            mapping or an iterable of mappings, each holding "sequence",
            "labels" and "scores" keys. The first label/score pair is taken
            as the best match, so entries are expected to be ordered
            best-first.
        faker_functions: Dictionary mapping labels to Faker functions
        threshold: Minimum confidence score to assign a function (default: 0.18)

    Returns:
        dict: Mapping of input sequences to corresponding Faker functions.
            A sequence maps to None when its best score is below the
            threshold, when its best label is not in ``faker_functions``,
            or when it carries no labels/scores at all.
    """
    # A lone classification may arrive as a bare dict; iterating it directly
    # would walk its keys instead of the result items.
    if isinstance(result, dict):
        result = [result]

    sequence_to_function = {}
    for item in result:
        labels = item["labels"]
        scores = item["scores"]

        matched = None
        # Guard against empty label/score lists before indexing position 0.
        if labels and scores and scores[0] >= threshold:
            matched = faker_functions.get(labels[0])

        sequence_to_function[item["sequence"]] = matched

    return sequence_to_function
@lru_cache(maxsize=1)
def _get_classifier():
    """Build the zero-shot classification pipeline once and reuse it."""
    # Loading the model is the expensive step; caching avoids repeating it
    # on every call.
    return pipeline(
        "zero-shot-classification",
        model="MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary",
    )


def get_functions_for_descriptions(descriptions, threshold=0.18):
    """
    Get mock data functions based on descriptions or property names.

    Uses zero-shot classification to match text descriptions to appropriate
    Faker functions for generating synthetic data.

    Args:
        descriptions: A single description string, or an iterable of
            descriptions or property names to classify
        threshold: Minimum confidence score to assign a function
            (default: 0.18)

    Returns:
        dict: Mapping of descriptions to corresponding mock data functions
            (None for descriptions without a confident match). Empty input
            yields an empty dict without loading the model.
    """
    # Normalise to a concrete list so a lone string is treated as one
    # description rather than being handled differently by the pipeline.
    if isinstance(descriptions, str):
        descriptions = [descriptions]
    else:
        descriptions = list(descriptions)

    # Nothing to classify: skip the model entirely instead of handing the
    # pipeline an empty batch.
    if not descriptions:
        return {}

    # Call pipeline with descriptions and available Faker function labels
    result = _get_classifier()(
        descriptions, candidate_labels=list(faker_functions)
    )

    # NOTE(review): some transformers versions appear to return a bare dict
    # for a single sequence; wrap it so the helper always sees a list.
    if isinstance(result, dict):
        result = [result]

    # Process results using helper function with the requested threshold
    return get_highest_score_functions(
        result, faker_functions, threshold=threshold
    )