# pii_masker/utils/model.py
# ffeew's picture
# init
# cd3f41e
import re

from gliner import GLiNER
# Instantiate the GLiNER PII model once, as a module-level side effect of
# importing this file. The resulting ``model`` object is the one
# pii_masking_pipeline() calls ``predict_entities`` on.
# The two prints bracket the load so the start and end of it are visible on
# stdout.
# NOTE(review): from_pretrained presumably downloads/caches the weights on
# first use, which would make the first import slow -- confirm.
print("Loading model and tokenizer...")
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
print("Model and tokenizer loaded.")
# Entity label -> placeholder token written into the masked text in place of
# the detected entity. Some labels deliberately share a placeholder (for
# example "email" / "email address", or the space- and underscore-separated
# spellings of the same label).
label_to_mask_map = {
    "name": "[NAME]",
    "nric": "[NRIC]",
    "phone number": "[PHONE]",
    "address": "[ADDRESS]",
    "email": "[EMAIL]",
    "person": "[PERSON]",
    "organization": "[ORGANIZATION]",
    "passport number": "[PASSPORT_NUMBER]",
    "credit card number": "[CREDIT_CARD]",
    "social security number": "[SSN]",
    "health insurance id number": "[HEALTH_INS_ID]",
    "date of birth": "[DOB]",
    "mobile phone number": "[MOBILE_PHONE]",
    "bank account number": "[BANK_ACCOUNT]",
    "medication": "[MEDICATION]",
    "cpf": "[CPF]",
    "driver's license number": "[DRIVER_LICENSE]",
    "tax identification number": "[TAX_ID]",
    "medical condition": "[MEDICAL_CONDITION]",
    "identity card number": "[IDENTITY_CARD]",
    "national id number": "[NATIONAL_ID]",
    "ip address": "[IP]",
    "email address": "[EMAIL]",
    "iban": "[IBAN]",
    "credit card expiration date": "[CREDIT_CARD_EXP]",
    "username": "[USERNAME]",
    "health insurance number": "[HEALTH_INS_NUM]",
    "registration number": "[REG_NUM]",
    "student id number": "[STUDENT_ID]",
    "insurance number": "[INSURANCE_NUM]",
    "flight number": "[FLIGHT_NUM]",
    "landline phone number": "[LANDLINE_PHONE]",
    "blood type": "[BLOOD_TYPE]",
    "cvv": "[CVV]",
    "reservation number": "[RESERVATION_NUM]",
    "digital signature": "[DIGITAL_SIGNATURE]",
    "social media handle": "[SOCIAL_MEDIA]",
    "license plate number": "[LICENSE_PLATE]",
    "cnpj": "[CNPJ]",
    "postal code": "[POSTAL_CODE]",
    "passport_number": "[PASSPORT_NUMBER]",
    "serial number": "[SERIAL_NUM]",
    "vehicle registration number": "[VEHICLE_REG_NUM]",
    "credit card brand": "[CREDIT_CARD_BRAND]",
    "fax number": "[FAX]",
    "visa number": "[VISA]",
    "insurance company": "[INSURANCE_COMPANY]",
    "identity document number": "[IDENTITY_DOCUMENT]",
    "transaction number": "[TRANSACTION_NUM]",
    "national health insurance number": "[NATIONAL_HEALTH_INS]",
    "cvc": "[CVC]",
    "birth certificate number": "[BIRTH_CERT]",
    "train ticket number": "[TRAIN_TICKET]",
    "passport expiration date": "[PASSPORT_EXP_DATE]",
    "social_security_number": "[SSN]",
}

# Every label this module knows how to mask, in the mapping's insertion order.
valid_labels = list(label_to_mask_map)
def mask_text(text, entities):
    """
    Masks the original text by replacing entities with corresponding labels.

    Every occurrence of each entity string is replaced. All replacements are
    made in a single left-to-right pass over the *original* text, so:

    - a placeholder that has already been inserted is never re-scanned and
      corrupted by a later entity (e.g. an entity whose text is "EMAIL"
      cannot rewrite an inserted "[EMAIL]");
    - when one entity string starts with another (e.g. "John" and
      "John Smith"), the longer string is matched first, so the remainder
      of the longer entity is not left unmasked.

    Args:
        text (str): The original text.
        entities (list): A list of dictionaries where each dictionary contains:
            - "text": the extracted entity string.
            - "label": the label for the entity; must be a key of
              ``label_to_mask_map``.

    Returns:
        str: The masked text.

    Raises:
        KeyError: If an entity lacks a "text"/"label" key or its label is
            not present in ``label_to_mask_map``.
    """
    placeholder_by_text = {}
    for entity in entities:
        # Look the label up for every entity so an unknown label always
        # raises, even if that entity ends up not being used.
        placeholder = label_to_mask_map[entity["label"]]
        entity_text = entity["text"]
        # An empty string matches at every position and would scatter the
        # placeholder between all characters, so it is ignored.
        if entity_text:
            # If the same string was reported under several labels, the
            # first one seen decides the placeholder.
            placeholder_by_text.setdefault(entity_text, placeholder)

    if not placeholder_by_text:
        return text

    # Longest alternative first: regex alternation takes the first branch
    # that matches, so this makes the longest entity win at any position.
    alternatives = sorted(placeholder_by_text, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(item) for item in alternatives))
    return pattern.sub(lambda match: placeholder_by_text[match.group(0)], text)
def pii_masking_pipeline(
    input_text,
    labels=("name", "nric", "phone number", "address", "email"),
):
    """
    Detect PII entities in the input text and replace them with placeholders.

    Args:
        input_text (str): The text to mask.
        labels (iterable of str): The PII entity labels to look for. Each one
            must be present in ``valid_labels``.

    Returns:
        str: The masked text.

    Raises:
        ValueError: If any requested label is not in ``valid_labels``.
    """
    # Reject the request up front if it names a label we cannot mask.
    unknown_labels = set(labels) - set(valid_labels)
    if unknown_labels:
        raise ValueError("Invalid labels provided.")

    # Run entity detection, then hand the detections to the masking step.
    detected = model.predict_entities(input_text, labels)
    return mask_text(input_text, detected)