ffeew committed on
Commit
cd3f41e
·
1 Parent(s): 781f636
Files changed (8) hide show
  1. .dockerignore +2 -0
  2. .gitignore +2 -0
  3. Dockerfile +13 -0
  4. README.md +1 -0
  5. main.py +50 -0
  6. requirements.txt +65 -0
  7. utils/model.py +116 -0
  8. utils/redis.py +3 -0
.dockerignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ venv/
2
+ __pycache__
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__
2
+ /venv
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ RUN apt-get update && apt-get install -y redis-server && rm -rf /var/lib/apt/lists/*
6
+
7
+ COPY requirements.txt /app
8
+
9
+ RUN pip install --no-cache-dir -r requirements.txt
10
+
11
+ COPY . /app
12
+
13
+ CMD ["/bin/bash", "-c", "redis-server --daemonize yes && streamlit run main.py --server.port=7860 --server.address=0.0.0.0"]
README.md CHANGED
@@ -7,6 +7,7 @@ sdk: docker
7
  pinned: false
8
  license: mit
9
  short_description: A simple Web GUI to mask PII information
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
7
  pinned: false
8
  license: mit
9
  short_description: A simple Web GUI to mask PII information
10
+ app_port: 7860
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
main.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import streamlit as st
3
+
4
+ from utils.model import pii_masking_pipeline
5
+ from utils.redis import redis_client
6
+
7
# Streamlit entry point: mask PII in user-supplied text, store each masked
# result in the Redis list "masked_texts", and show a paginated history of it.
st.set_page_config(page_title="PII Masking Tool", page_icon="🔒")

st.title("PII Masking Tool")

# Text input
text_to_mask = st.text_area("Enter text to mask PII:", height=200)

if st.button("Mask Text"):
    if text_to_mask:
        masked_text = pii_masking_pipeline(text_to_mask)

        # Display results
        st.subheader("Masked Result:")
        st.write(masked_text)

        # Only the masked output is stored, never the raw input. LPUSH
        # prepends, so the newest entry is first in the history below.
        redis_client.lpush("masked_texts", masked_text)
    else:
        st.warning("Please enter some text to mask.")


# Pagination logic
ITEMS_PER_PAGE = 10
PREVIEW_CHARS = 30  # characters of each entry shown in its expander title
total_items = redis_client.llen("masked_texts")
total_pages = math.ceil(total_items / ITEMS_PER_PAGE)

if total_items > 0:
    st.subheader("Previous Masked Texts")

    col1, col2, col3 = st.columns([1, 3, 1])
    with col1:
        page = st.number_input("Page", min_value=1, max_value=max(1, total_pages), value=1)
    with col2:
        st.write(f"Page {page} of {total_pages}")

    # LRANGE indices are zero-based and inclusive at both ends.
    start_idx = (page - 1) * ITEMS_PER_PAGE
    end_idx = start_idx + ITEMS_PER_PAGE - 1

    texts = redis_client.lrange("masked_texts", start_idx, end_idx)
    for i, raw_txt in enumerate(texts, start=start_idx + 1):
        # Decode once and measure the preview in characters rather than
        # bytes, so multibyte text only gets "..." when it was really cut.
        full_txt = raw_txt.decode("utf-8")
        if len(full_txt) > PREVIEW_CHARS:
            display_txt = full_txt[:PREVIEW_CHARS] + "..."
        else:
            display_txt = full_txt
        with st.expander(f"#{i}. {display_txt}"):
            st.write(full_txt)
50
+
requirements.txt ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.5.0
2
+ attrs==25.1.0
3
+ blinker==1.9.0
4
+ cachetools==5.5.1
5
+ certifi==2024.12.14
6
+ charset-normalizer==3.4.1
7
+ click==8.1.8
8
+ colorama==0.4.6
9
+ coloredlogs==15.0.1
10
+ filelock==3.17.0
11
+ flatbuffers==25.1.24
12
+ fsspec==2024.12.0
13
+ gitdb==4.0.12
14
+ GitPython==3.1.44
15
+ gliner==0.2.16
16
+ hiredis==3.1.0
17
+ huggingface-hub==0.27.1
18
+ humanfriendly==10.0
19
+ idna==3.10
20
+ Jinja2==3.1.5
21
+ jsonschema==4.23.0
22
+ jsonschema-specifications==2024.10.1
23
+ markdown-it-py==3.0.0
24
+ MarkupSafe==3.0.2
25
+ mdurl==0.1.2
26
+ mpmath==1.3.0
27
+ narwhals==1.24.0
28
+ networkx==3.4.2
29
+ numpy==2.2.2
30
+ onnxruntime==1.20.1
31
+ packaging==24.2
32
+ pandas==2.2.3
33
+ pillow==11.1.0
34
+ protobuf==5.29.3
35
+ pyarrow==19.0.0
36
+ pydeck==0.9.1
37
+ Pygments==2.19.1
38
+ pyreadline3==3.5.4
39
+ python-dateutil==2.9.0.post0
40
+ pytz==2024.2
41
+ PyYAML==6.0.2
42
+ redis==5.2.1
43
+ referencing==0.36.2
44
+ regex==2024.11.6
45
+ requests==2.32.3
46
+ rich==13.9.4
47
+ rpds-py==0.22.3
48
+ safetensors==0.5.2
49
+ sentencepiece==0.2.0
50
+ setuptools==75.8.0
51
+ six==1.17.0
52
+ smmap==5.0.2
53
+ streamlit==1.41.1
54
+ sympy==1.13.1
55
+ tenacity==9.0.0
56
+ tokenizers==0.21.0
57
+ toml==0.10.2
58
+ torch==2.5.1
59
+ tornado==6.4.2
60
+ tqdm==4.67.1
61
+ transformers==4.48.1
62
+ typing_extensions==4.12.2
63
+ tzdata==2025.1
64
+ urllib3==2.3.0
65
+ watchdog==6.0.0
utils/model.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gliner import GLiNER
2
+
3
+ print("Loading model and tokenizer...")
4
+
5
+ model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
6
+
7
+ print("Model and tokenizer loaded.")
8
+
9
+ label_to_mask_map = {
10
+ "name": "[NAME]",
11
+ "nric": "[NRIC]",
12
+ "phone number": "[PHONE]",
13
+ "address": "[ADDRESS]",
14
+ "email": "[EMAIL]",
15
+ "person": "[PERSON]",
16
+ "organization": "[ORGANIZATION]",
17
+ "passport number": "[PASSPORT_NUMBER]",
18
+ "credit card number": "[CREDIT_CARD]",
19
+ "social security number": "[SSN]",
20
+ "health insurance id number": "[HEALTH_INS_ID]",
21
+ "date of birth": "[DOB]",
22
+ "mobile phone number": "[MOBILE_PHONE]",
23
+ "bank account number": "[BANK_ACCOUNT]",
24
+ "medication": "[MEDICATION]",
25
+ "cpf": "[CPF]",
26
+ "driver's license number": "[DRIVER_LICENSE]",
27
+ "tax identification number": "[TAX_ID]",
28
+ "medical condition": "[MEDICAL_CONDITION]",
29
+ "identity card number": "[IDENTITY_CARD]",
30
+ "national id number": "[NATIONAL_ID]",
31
+ "ip address": "[IP]",
32
+ "email address": "[EMAIL]",
33
+ "iban": "[IBAN]",
34
+ "credit card expiration date": "[CREDIT_CARD_EXP]",
35
+ "username": "[USERNAME]",
36
+ "health insurance number": "[HEALTH_INS_NUM]",
37
+ "registration number": "[REG_NUM]",
38
+ "student id number": "[STUDENT_ID]",
39
+ "insurance number": "[INSURANCE_NUM]",
40
+ "flight number": "[FLIGHT_NUM]",
41
+ "landline phone number": "[LANDLINE_PHONE]",
42
+ "blood type": "[BLOOD_TYPE]",
43
+ "cvv": "[CVV]",
44
+ "reservation number": "[RESERVATION_NUM]",
45
+ "digital signature": "[DIGITAL_SIGNATURE]",
46
+ "social media handle": "[SOCIAL_MEDIA]",
47
+ "license plate number": "[LICENSE_PLATE]",
48
+ "cnpj": "[CNPJ]",
49
+ "postal code": "[POSTAL_CODE]",
50
+ "passport_number": "[PASSPORT_NUMBER]",
51
+ "serial number": "[SERIAL_NUM]",
52
+ "vehicle registration number": "[VEHICLE_REG_NUM]",
53
+ "credit card brand": "[CREDIT_CARD_BRAND]",
54
+ "fax number": "[FAX]",
55
+ "visa number": "[VISA]",
56
+ "insurance company": "[INSURANCE_COMPANY]",
57
+ "identity document number": "[IDENTITY_DOCUMENT]",
58
+ "transaction number": "[TRANSACTION_NUM]",
59
+ "national health insurance number": "[NATIONAL_HEALTH_INS]",
60
+ "cvc": "[CVC]",
61
+ "birth certificate number": "[BIRTH_CERT]",
62
+ "train ticket number": "[TRAIN_TICKET]",
63
+ "passport expiration date": "[PASSPORT_EXP_DATE]",
64
+ "social_security_number": "[SSN]",
65
+ }
66
+
67
+
68
+ valid_labels = list(label_to_mask_map.keys())
69
+
70
+
71
def mask_text(text, entities, mask_map=None):
    """
    Masks the original text by replacing entities with corresponding labels.

    Every occurrence of each entity string is replaced in a single
    left-to-right pass. When several entity strings could match at the same
    position, the longest one wins, so an entity that is a substring of
    another (e.g. "John" vs. "John Smith") cannot leave part of the longer
    one unmasked. Inserted placeholders are never rescanned.

    Args:
        text (str): The original text.
        entities (list): A list of dictionaries where each dictionary contains:
            - "text": the extracted entity string.
            - "label": the label for the entity.
        mask_map (dict, optional): Mapping from entity label to placeholder
            string. Defaults to the module-level ``label_to_mask_map``.

    Returns:
        str: The masked text.

    Raises:
        KeyError: If an entity's label has no entry in the mapping.
    """
    import re  # local import so the function does not rely on file-level imports

    if mask_map is None:
        mask_map = label_to_mask_map

    # Map each distinct entity string to its placeholder. If the same string
    # appears with different labels, the first one listed wins.
    placeholder_by_text = {}
    for entity in entities:
        placeholder = mask_map[entity["label"]]
        # An empty entity string would match between every character.
        if entity["text"]:
            placeholder_by_text.setdefault(entity["text"], placeholder)

    if not placeholder_by_text:
        return text

    # Longest alternatives first: regex alternation takes the first branch
    # that matches, so this makes the longest entity win at each position.
    alternatives = sorted(placeholder_by_text, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(item) for item in alternatives))

    return pattern.sub(lambda match: placeholder_by_text[match.group(0)], text)
93
+
94
+
95
def pii_masking_pipeline(
    input_text,
    labels=("name", "nric", "phone number", "address", "email"),
):
    """
    Masks the PII entities in the input text.

    Args:
        input_text (str): The input text to mask.
        labels (list): The list of PII entity labels to mask. Any iterable of
            label strings is accepted; every label must be in ``valid_labels``.

    Returns:
        str: The masked text. Blank input (empty or whitespace-only) is
        returned unchanged without running the model.

    Raises:
        ValueError: If any requested label is not in ``valid_labels``.
    """

    # check that the labels are a subset of valid labels
    if not set(labels).issubset(valid_labels):
        raise ValueError("Invalid labels provided.")

    # Blank input has nothing to detect, so skip model inference.
    if not input_text or not input_text.strip():
        return input_text

    # Pass the labels as a list, as the docstring describes; the default is a tuple.
    entities = model.predict_entities(input_text, list(labels))
    return mask_text(input_text, entities)
utils/redis.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ import redis
2
+
3
+ redis_client = redis.Redis(host="localhost", port=6379)