Spaces:

ffeew
/

pii_masker

Sleeping

App Files Files Community

ffeew committed on Jan 27

Commit

cd3f41e

1 Parent(s): 781f636

init

Browse files

Files changed (8) hide show

.dockerignore +2 -0
.gitignore +2 -0
Dockerfile +13 -0
README.md +1 -0
main.py +50 -0
requirements.txt +65 -0
utils/model.py +116 -0
utils/redis.py +3 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ venv/
2	+ __pycache__

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__
2	+ /venv

Dockerfile ADDED Viewed

	@@ -0,0 +1,13 @@

+# Single-container image: Redis and the Streamlit app run side by side.
+FROM python:3.10-slim
+WORKDIR /app
+# Install the Redis server inside the image and drop the apt package lists afterwards.
+RUN apt-get update && apt-get install -y redis-server && rm -rf /var/lib/apt/lists/*
+# requirements.txt is copied and installed before the rest of the sources are copied.
+COPY requirements.txt /app
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . /app
+# Start Redis as a background daemon, then serve main.py with Streamlit on port 7860
+# (the same port the README front matter declares as app_port), bound to all interfaces.
+CMD ["/bin/bash", "-c", "redis-server --daemonize yes && streamlit run main.py --server.port=7860 --server.address=0.0.0.0"]

README.md CHANGED Viewed

@@ -7,6 +7,7 @@ sdk: docker
 pinned: false
 license: mit
 short_description: A simple Web GUI to mask PII information
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 license: mit
 short_description: A simple Web GUI to mask PII information
+app_port: 7860
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

main.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import math
+import streamlit as st
+from utils.model import pii_masking_pipeline
+from utils.redis import redis_client
+st.set_page_config(page_title="PII Masking Tool", page_icon="🔒")
+st.title("PII Masking Tool")
+# Text input
+text_to_mask = st.text_area("Enter text to mask PII:", height=200)
+if st.button("Mask Text"):
+    if text_to_mask:
+        masked_text = pii_masking_pipeline(text_to_mask)
+        # Display results
+        st.subheader("Masked Result:")
+        st.write(masked_text)
+        redis_client.lpush("masked_texts", masked_text)
+    else:
+        st.warning("Please enter some text to mask.")
+# Pagination logic
+ITEMS_PER_PAGE = 10
+total_items = redis_client.llen("masked_texts")
+total_pages = math.ceil(total_items / ITEMS_PER_PAGE)
+if total_items > 0:
+    st.subheader("Previous Masked Texts")
+    col1, col2, col3 = st.columns([1, 3, 1])
+    with col1:
+        page = st.number_input("Page", min_value=1, max_value=max(1, total_pages), value=1)
+    with col2:
+        st.write(f"Page {page} of {total_pages}")
+    start_idx = (page - 1) * ITEMS_PER_PAGE
+    end_idx = start_idx + ITEMS_PER_PAGE - 1
+    texts = redis_client.lrange("masked_texts", start_idx, end_idx)
+    for i, txt in enumerate(texts, start=start_idx + 1):
+        display_txt = txt.decode('utf-8')[:30] + "..." if len(txt) > 30 else txt.decode('utf-8')
+        with st.expander(f"#{i}. {display_txt}"):
+            st.write(txt.decode('utf-8'))

requirements.txt ADDED Viewed

	@@ -0,0 +1,65 @@

+altair==5.5.0
+attrs==25.1.0
+blinker==1.9.0
+cachetools==5.5.1
+certifi==2024.12.14
+charset-normalizer==3.4.1
+click==8.1.8
+colorama==0.4.6
+coloredlogs==15.0.1
+filelock==3.17.0
+flatbuffers==25.1.24
+fsspec==2024.12.0
+gitdb==4.0.12
+GitPython==3.1.44
+gliner==0.2.16
+hiredis==3.1.0
+huggingface-hub==0.27.1
+humanfriendly==10.0
+idna==3.10
+Jinja2==3.1.5
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mpmath==1.3.0
+narwhals==1.24.0
+networkx==3.4.2
+numpy==2.2.2
+onnxruntime==1.20.1
+packaging==24.2
+pandas==2.2.3
+pillow==11.1.0
+protobuf==5.29.3
+pyarrow==19.0.0
+pydeck==0.9.1
+Pygments==2.19.1
+pyreadline3==3.5.4
+python-dateutil==2.9.0.post0
+pytz==2024.2
+PyYAML==6.0.2
+redis==5.2.1
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.22.3
+safetensors==0.5.2
+sentencepiece==0.2.0
+setuptools==75.8.0
+six==1.17.0
+smmap==5.0.2
+streamlit==1.41.1
+sympy==1.13.1
+tenacity==9.0.0
+tokenizers==0.21.0
+toml==0.10.2
+torch==2.5.1
+tornado==6.4.2
+tqdm==4.67.1
+transformers==4.48.1
+typing_extensions==4.12.2
+tzdata==2025.1
+urllib3==2.3.0
+watchdog==6.0.0

utils/model.py ADDED Viewed

	@@ -0,0 +1,116 @@

+from gliner import GLiNER
+print("Loading model and tokenizer...")
+model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")  # executed at import time: importing this module loads the model
+print("Model and tokenizer loaded.")
+label_to_mask_map = {  # entity label -> placeholder string written into the masked text
+    "name": "[NAME]",
+    "nric": "[NRIC]",
+    "phone number": "[PHONE]",
+    "address": "[ADDRESS]",
+    "email": "[EMAIL]",
+    "person": "[PERSON]",
+    "organization": "[ORGANIZATION]",
+    "passport number": "[PASSPORT_NUMBER]",
+    "credit card number": "[CREDIT_CARD]",
+    "social security number": "[SSN]",
+    "health insurance id number": "[HEALTH_INS_ID]",
+    "date of birth": "[DOB]",
+    "mobile phone number": "[MOBILE_PHONE]",
+    "bank account number": "[BANK_ACCOUNT]",
+    "medication": "[MEDICATION]",
+    "cpf": "[CPF]",
+    "driver's license number": "[DRIVER_LICENSE]",
+    "tax identification number": "[TAX_ID]",
+    "medical condition": "[MEDICAL_CONDITION]",
+    "identity card number": "[IDENTITY_CARD]",
+    "national id number": "[NATIONAL_ID]",
+    "ip address": "[IP]",
+    "email address": "[EMAIL]",  # shares its placeholder with "email"
+    "iban": "[IBAN]",
+    "credit card expiration date": "[CREDIT_CARD_EXP]",
+    "username": "[USERNAME]",
+    "health insurance number": "[HEALTH_INS_NUM]",
+    "registration number": "[REG_NUM]",
+    "student id number": "[STUDENT_ID]",
+    "insurance number": "[INSURANCE_NUM]",
+    "flight number": "[FLIGHT_NUM]",
+    "landline phone number": "[LANDLINE_PHONE]",
+    "blood type": "[BLOOD_TYPE]",
+    "cvv": "[CVV]",
+    "reservation number": "[RESERVATION_NUM]",
+    "digital signature": "[DIGITAL_SIGNATURE]",
+    "social media handle": "[SOCIAL_MEDIA]",
+    "license plate number": "[LICENSE_PLATE]",
+    "cnpj": "[CNPJ]",
+    "postal code": "[POSTAL_CODE]",
+    "passport_number": "[PASSPORT_NUMBER]",  # underscore spelling; same placeholder as "passport number"
+    "serial number": "[SERIAL_NUM]",
+    "vehicle registration number": "[VEHICLE_REG_NUM]",
+    "credit card brand": "[CREDIT_CARD_BRAND]",
+    "fax number": "[FAX]",
+    "visa number": "[VISA]",
+    "insurance company": "[INSURANCE_COMPANY]",
+    "identity document number": "[IDENTITY_DOCUMENT]",
+    "transaction number": "[TRANSACTION_NUM]",
+    "national health insurance number": "[NATIONAL_HEALTH_INS]",
+    "cvc": "[CVC]",
+    "birth certificate number": "[BIRTH_CERT]",
+    "train ticket number": "[TRAIN_TICKET]",
+    "passport expiration date": "[PASSPORT_EXP_DATE]",
+    "social_security_number": "[SSN]",  # underscore spelling; same placeholder as "social security number"
+}
+valid_labels = list(label_to_mask_map.keys())  # the labels pii_masking_pipeline accepts
+def mask_text(text, entities):
+    """
+    Masks the original text by replacing entities with corresponding labels.
+    Args:
+        text (str): The original text.
+        entities (list): A list of dictionaries where each dictionary contains:
+                         - "text": the extracted entity string.
+                         - "label": the label for the entity.
+    Returns:
+        str: The masked text.
+    """
+    # Sort entities by their occurrence in text to avoid conflicts
+    entities_sorted = sorted(entities, key=lambda x: text.find(x["text"]), reverse=True)
+    # Replace each entity with its corresponding label
+    for entity in entities_sorted:
+        label_placeholder = f"{label_to_mask_map[entity['label']]}"
+        text = text.replace(entity["text"], label_placeholder)
+    return text
+def pii_masking_pipeline(
+    input_text,
+    labels=("name", "nric", "phone number", "address", "email"),
+):
+    """
+    Masks the PII entities in the input text.
+    Args:
+        input_text (str): The input text to mask.
+        labels (list): The list of PII entity labels to mask.
+    Returns:
+        str: The masked text.
+    """
+    # check that the labels are a subset of valid labels
+    if not set(labels).issubset(valid_labels):
+        raise ValueError("Invalid labels provided.")
+    entities = model.predict_entities(input_text, labels)
+    masked_text = mask_text(input_text, entities)
+    return masked_text

utils/redis.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ import redis
2	+
3	+ redis_client = redis.Redis(host="localhost", port=6379)  # shared client for the Redis server on localhost:6379 (the Dockerfile CMD starts redis-server in the same container)