"""PII masking built on the GLiNER multi-PII model.

Importing this module loads the ``urchade/gliner_multi_pii-v1`` model once and
keeps it in the module-level ``model`` variable, which
``pii_masking_pipeline`` then uses for every call.
"""
import re

from gliner import GLiNER

print("Loading model and tokenizer...")
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
print("Model and tokenizer loaded.")

# Maps every supported entity label to the placeholder that replaces the
# matched text. Several labels intentionally share a placeholder
# (e.g. "email" and "email address" both become "[EMAIL]").
label_to_mask_map = {
    "name": "[NAME]",
    "nric": "[NRIC]",
    "phone number": "[PHONE]",
    "address": "[ADDRESS]",
    "email": "[EMAIL]",
    "person": "[PERSON]",
    "organization": "[ORGANIZATION]",
    "passport number": "[PASSPORT_NUMBER]",
    "credit card number": "[CREDIT_CARD]",
    "social security number": "[SSN]",
    "health insurance id number": "[HEALTH_INS_ID]",
    "date of birth": "[DOB]",
    "mobile phone number": "[MOBILE_PHONE]",
    "bank account number": "[BANK_ACCOUNT]",
    "medication": "[MEDICATION]",
    "cpf": "[CPF]",
    "driver's license number": "[DRIVER_LICENSE]",
    "tax identification number": "[TAX_ID]",
    "medical condition": "[MEDICAL_CONDITION]",
    "identity card number": "[IDENTITY_CARD]",
    "national id number": "[NATIONAL_ID]",
    "ip address": "[IP]",
    "email address": "[EMAIL]",
    "iban": "[IBAN]",
    "credit card expiration date": "[CREDIT_CARD_EXP]",
    "username": "[USERNAME]",
    "health insurance number": "[HEALTH_INS_NUM]",
    "registration number": "[REG_NUM]",
    "student id number": "[STUDENT_ID]",
    "insurance number": "[INSURANCE_NUM]",
    "flight number": "[FLIGHT_NUM]",
    "landline phone number": "[LANDLINE_PHONE]",
    "blood type": "[BLOOD_TYPE]",
    "cvv": "[CVV]",
    "reservation number": "[RESERVATION_NUM]",
    "digital signature": "[DIGITAL_SIGNATURE]",
    "social media handle": "[SOCIAL_MEDIA]",
    "license plate number": "[LICENSE_PLATE]",
    "cnpj": "[CNPJ]",
    "postal code": "[POSTAL_CODE]",
    "passport_number": "[PASSPORT_NUMBER]",
    "serial number": "[SERIAL_NUM]",
    "vehicle registration number": "[VEHICLE_REG_NUM]",
    "credit card brand": "[CREDIT_CARD_BRAND]",
    "fax number": "[FAX]",
    "visa number": "[VISA]",
    "insurance company": "[INSURANCE_COMPANY]",
    "identity document number": "[IDENTITY_DOCUMENT]",
    "transaction number": "[TRANSACTION_NUM]",
    "national health insurance number": "[NATIONAL_HEALTH_INS]",
    "cvc": "[CVC]",
    "birth certificate number": "[BIRTH_CERT]",
    "train ticket number": "[TRAIN_TICKET]",
    "passport expiration date": "[PASSPORT_EXP_DATE]",
    "social_security_number": "[SSN]",
}

# Every label accepted by pii_masking_pipeline, in declaration order.
valid_labels = list(label_to_mask_map.keys())


def mask_text(text: str, entities) -> str:
    """Replace every occurrence of each entity's text with its placeholder.

    All replacements happen in a single left-to-right pass over the original
    text, so an inserted placeholder is never re-scanned (an entity whose text
    is "NAME" cannot corrupt an earlier "[NAME]"), and a longer entity is
    always preferred over a shorter one that it contains ("John Smith" is
    masked as a whole even when "John" is also an entity).

    Args:
        text (str): The original text.
        entities (list): Dictionaries, each containing at least:
            - "text": the extracted entity string.
            - "label": the entity label, a key of ``label_to_mask_map``.

    Returns:
        str: The masked text. Returned unchanged when there is nothing to
        mask.

    Raises:
        KeyError: If an entity's label is not in ``label_to_mask_map``.
    """
    placeholder_by_text = {}
    for entity in entities:
        # Look the label up first so an unknown label always raises.
        placeholder = label_to_mask_map[entity["label"]]
        entity_text = entity["text"]
        # An empty pattern would match between every character; skip it.
        if entity_text:
            # If the same text was extracted under several labels, the first
            # one encountered decides the placeholder.
            placeholder_by_text.setdefault(entity_text, placeholder)

    if not placeholder_by_text:
        return text

    # Regex alternation tries alternatives in order, so listing the longest
    # texts first makes the longest entity win at any given position.
    longest_first = sorted(placeholder_by_text, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(item) for item in longest_first))
    return pattern.sub(lambda match: placeholder_by_text[match.group(0)], text)


def pii_masking_pipeline(
    input_text: str,
    labels=("name", "nric", "phone number", "address", "email"),
) -> str:
    """Detect PII entities in the input text and mask them.

    Args:
        input_text (str): The input text to mask.
        labels: Iterable of PII entity labels to detect and mask. Every label
            must be one of ``valid_labels``.

    Returns:
        str: The masked text.

    Raises:
        ValueError: If any of ``labels`` is not a valid label.
    """
    # Check that the labels are a subset of valid labels.
    if not set(labels).issubset(valid_labels):
        raise ValueError("Invalid labels provided.")

    # Nothing to mask in empty or whitespace-only input; skip the model call.
    if not input_text or not input_text.strip():
        return input_text

    entities = model.predict_entities(input_text, labels)
    return mask_text(input_text, entities)