# Page banner captured along with this file (not part of the program): "Spaces: Sleeping"
from gliner import GLiNER

# The model is created once, at import time, and stored in the module-level
# name ``model`` so the functions in this file can share a single instance.
# The two prints bracket the load so its start and end are visible on stdout.
print("Loading model and tokenizer...")
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
print("Model and tokenizer loaded.")
# Maps each supported entity label to the placeholder that replaces a matched
# entity in masked output. Several labels intentionally share a placeholder
# (e.g. "email" / "email address", and the spaced / underscored spellings of
# the passport and social-security labels).
label_to_mask_map = {
    "name": "[NAME]",
    "nric": "[NRIC]",
    "phone number": "[PHONE]",
    "address": "[ADDRESS]",
    "email": "[EMAIL]",
    "person": "[PERSON]",
    "organization": "[ORGANIZATION]",
    "passport number": "[PASSPORT_NUMBER]",
    "credit card number": "[CREDIT_CARD]",
    "social security number": "[SSN]",
    "health insurance id number": "[HEALTH_INS_ID]",
    "date of birth": "[DOB]",
    "mobile phone number": "[MOBILE_PHONE]",
    "bank account number": "[BANK_ACCOUNT]",
    "medication": "[MEDICATION]",
    "cpf": "[CPF]",
    "driver's license number": "[DRIVER_LICENSE]",
    "tax identification number": "[TAX_ID]",
    "medical condition": "[MEDICAL_CONDITION]",
    "identity card number": "[IDENTITY_CARD]",
    "national id number": "[NATIONAL_ID]",
    "ip address": "[IP]",
    "email address": "[EMAIL]",
    "iban": "[IBAN]",
    "credit card expiration date": "[CREDIT_CARD_EXP]",
    "username": "[USERNAME]",
    "health insurance number": "[HEALTH_INS_NUM]",
    "registration number": "[REG_NUM]",
    "student id number": "[STUDENT_ID]",
    "insurance number": "[INSURANCE_NUM]",
    "flight number": "[FLIGHT_NUM]",
    "landline phone number": "[LANDLINE_PHONE]",
    "blood type": "[BLOOD_TYPE]",
    "cvv": "[CVV]",
    "reservation number": "[RESERVATION_NUM]",
    "digital signature": "[DIGITAL_SIGNATURE]",
    "social media handle": "[SOCIAL_MEDIA]",
    "license plate number": "[LICENSE_PLATE]",
    "cnpj": "[CNPJ]",
    "postal code": "[POSTAL_CODE]",
    "passport_number": "[PASSPORT_NUMBER]",
    "serial number": "[SERIAL_NUM]",
    "vehicle registration number": "[VEHICLE_REG_NUM]",
    "credit card brand": "[CREDIT_CARD_BRAND]",
    "fax number": "[FAX]",
    "visa number": "[VISA]",
    "insurance company": "[INSURANCE_COMPANY]",
    "identity document number": "[IDENTITY_DOCUMENT]",
    "transaction number": "[TRANSACTION_NUM]",
    "national health insurance number": "[NATIONAL_HEALTH_INS]",
    "cvc": "[CVC]",
    "birth certificate number": "[BIRTH_CERT]",
    "train ticket number": "[TRAIN_TICKET]",
    "passport expiration date": "[PASSPORT_EXP_DATE]",
    "social_security_number": "[SSN]",
}

# Every label that has a placeholder, in the insertion order of the mapping.
valid_labels = list(label_to_mask_map)
def mask_text(text, entities):
    """Replace every occurrence of each entity string in ``text`` with its mask.

    All substitutions are made in a single left-to-right pass over the
    *original* text, trying longer entity strings before shorter ones. This
    avoids two problems that sequential ``str.replace`` calls have:

    * an entity that is a substring of another (e.g. "Smith" inside
      "John Smith") can no longer be replaced first and prevent the longer
      entity from matching, which would leave part of it unmasked;
    * placeholders that have already been inserted are never rescanned, so an
      entity string such as "NAME" cannot corrupt an earlier "[NAME]".

    Args:
        text (str): The original text.
        entities (list): A list of dictionaries where each dictionary contains:
            - "text": the extracted entity string.
            - "label": the label for the entity; must be a key of
              ``label_to_mask_map``.

    Returns:
        str: The masked text. ``text`` is returned unchanged when there is
        nothing to replace.

    Raises:
        KeyError: If an entity's label has no entry in ``label_to_mask_map``.
    """
    import re  # local import so this function does not rely on a file-level import

    # Entity string -> placeholder. When the same string is reported under
    # several labels, the first one listed wins.
    placeholder_for = {}
    for entity in entities:
        placeholder = label_to_mask_map[entity["label"]]
        surface = entity["text"]
        # An empty entity string would match at every position; ignore it.
        if surface:
            placeholder_for.setdefault(surface, placeholder)

    if not placeholder_for:
        return text

    # Regex alternation picks the first alternative that matches at a given
    # position, so listing longer strings first gives longest-match semantics.
    longest_first = sorted(placeholder_for, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(surface) for surface in longest_first))
    return pattern.sub(lambda match: placeholder_for[match.group(0)], text)
def pii_masking_pipeline(
    input_text,
    labels=("name", "nric", "phone number", "address", "email"),
):
    """Mask the PII entities found in ``input_text``.

    The requested labels are validated against ``valid_labels``, the
    module-level ``model`` is asked for entities with those labels, and the
    result is passed to ``mask_text``.

    Args:
        input_text (str): The input text to mask.
        labels (Iterable[str]): The PII entity labels to mask. Each one must
            be present in ``valid_labels``.

    Returns:
        str: The masked text. ``input_text`` is returned as-is when it is
        empty or when ``labels`` is empty, since there is nothing to mask.

    Raises:
        ValueError: If ``labels`` contains a label that is not in
            ``valid_labels``.
    """
    # ``labels`` is read twice (validation, then the model call); materialise
    # it so a one-shot iterable is not exhausted by the validation step.
    labels = list(labels)

    # check that the labels are a subset of valid labels
    if not set(labels).issubset(valid_labels):
        raise ValueError("Invalid labels provided.")

    # Nothing can be detected in empty text or with no labels; skip the model.
    if not input_text or not labels:
        return input_text

    entities = model.predict_entities(input_text, labels)
    return mask_text(input_text, entities)