Spaces:

edodso2
/

mock-json-generator

Sleeping

App Files Files Community

mock-json-generator / utils /classify.py

edodso2

Fix incorrect comment

8a0b401 about 2 months ago

raw

history blame contribute delete

2.78 kB

	"""
	Utility for generating mock data functions based on text descriptions.

	This utility uses a zero-shot classification model to match text descriptions
	to appropriate Faker functions for generating synthetic data.
	"""

	from transformers import pipeline
	from faker import Faker

	# Single Faker instance whose bound methods populate the lookup table below.
	fake = Faker()

	# Descriptive label -> Faker generator. The labels double as the candidate
	# labels passed to the classifier, so their wording and order are kept as-is.
	faker_functions = dict(
	    (
	        # People and contact details
	        ("person name", fake.name),
	        ("first name", fake.first_name),
	        ("last name", fake.last_name),
	        ("email address", fake.email),
	        ("phone number", fake.phone_number),
	        # Location
	        ("street address", fake.street_address),
	        ("street", fake.street_address),
	        ("city name", fake.city),
	        ("state name", fake.state),
	        ("country name", fake.country),
	        ("zip code", fake.zipcode),
	        # Work, finance and identity
	        ("job title", fake.job),
	        ("company name", fake.company),
	        ("credit card number", fake.credit_card_number),
	        ("date of birth", fake.date_of_birth),
	        ("username", fake.user_name),
	        # Web and free text
	        ("website url", fake.url),
	        ("paragraph text", fake.paragraph),
	        ("sentence text", fake.sentence),
	    )
	)


	def get_highest_score_functions(result, faker_functions, threshold=0.18):
	    """
	    Map each classified sequence to the Faker function of its first label.

	    Args:
	        result: Classification output. Either a list of dicts or a single
	            dict; each dict must provide "sequence", "labels" and "scores".
	            The first entry of "labels"/"scores" is used as the best match.
	        faker_functions: Dictionary mapping labels to Faker functions.
	        threshold: Minimum score (inclusive) required to assign a function
	            (default: 0.18).

	    Returns:
	        dict: Mapping of each input sequence to its Faker function, or to
	        None when the first score is below the threshold or the first label
	        has no entry in ``faker_functions``.
	    """
	    # A lone result dict (not wrapped in a list) counts as one item;
	    # iterating it directly would walk its keys and fail on indexing.
	    if isinstance(result, dict):
	        result = [result]

	    sequence_to_function = {}
	    for item in result:
	        top_label = item["labels"][0]
	        top_score = item["scores"][0]
	        sequence_to_function[item["sequence"]] = (
	            faker_functions.get(top_label) if top_score >= threshold else None
	        )
	    return sequence_to_function


	# Checkpoint used for zero-shot classification of the descriptions.
	_ZERO_SHOT_MODEL = "MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary"

	# Pipeline instance shared across calls; built lazily on first use so that
	# importing this module does not create it.
	_classifier = None


	def _get_classifier():
	    """Return the shared zero-shot pipeline, creating it on first use."""
	    global _classifier
	    if _classifier is None:
	        _classifier = pipeline(model=_ZERO_SHOT_MODEL)
	    return _classifier


	def get_functions_for_descriptions(descriptions, threshold=0.18):
	    """
	    Get mock data functions based on descriptions or property names.

	    Uses zero-shot classification to match text descriptions to appropriate
	    Faker functions for generating synthetic data.

	    Args:
	        descriptions: Array of descriptions or property names to classify.
	            A single string is treated as a one-element array.
	        threshold: Minimum confidence score (inclusive) required to assign
	            a function (default: 0.18).

	    Returns:
	        dict: Mapping of descriptions to corresponding mock data functions
	        (None where no label reached the threshold). Empty or missing
	        input yields an empty dict without invoking the model.
	    """
	    # Nothing to classify: skip the pipeline entirely.
	    if not descriptions:
	        return {}

	    # Wrap a bare string so the input passed to the pipeline is always a list.
	    if isinstance(descriptions, str):
	        descriptions = [descriptions]

	    # Classify against every label that has a Faker function registered.
	    classifier = _get_classifier()
	    result = classifier(descriptions, candidate_labels=list(faker_functions))

	    # Keep only matches whose best score reaches the threshold.
	    return get_highest_score_functions(result, faker_functions, threshold=threshold)