Spaces:
Sleeping
Sleeping
init
Browse files- .dockerignore +2 -0
- .gitignore +2 -0
- Dockerfile +13 -0
- README.md +1 -0
- main.py +50 -0
- requirements.txt +65 -0
- utils/model.py +116 -0
- utils/redis.py +3 -0
.dockerignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
venv/
|
2 |
+
__pycache__
|
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
__pycache__
|
2 |
+
/venv
|
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10-slim
|
2 |
+
|
3 |
+
WORKDIR /app
|
4 |
+
|
5 |
+
RUN apt-get update && apt-get install -y redis-server && rm -rf /var/lib/apt/lists/*
|
6 |
+
|
7 |
+
COPY requirements.txt /app
|
8 |
+
|
9 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
10 |
+
|
11 |
+
COPY . /app
|
12 |
+
|
13 |
+
CMD ["/bin/bash", "-c", "redis-server --daemonize yes && streamlit run main.py --server.port=7860 --server.address=0.0.0.0"]
|
README.md
CHANGED
@@ -7,6 +7,7 @@ sdk: docker
|
|
7 |
pinned: false
|
8 |
license: mit
|
9 |
short_description: A simple Web GUI to mask PII information
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
7 |
pinned: false
|
8 |
license: mit
|
9 |
short_description: A simple Web GUI to mask PII information
|
10 |
+
app_port: 7860
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
main.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
from utils.model import pii_masking_pipeline
|
5 |
+
from utils.redis import redis_client
|
6 |
+
|
7 |
+
st.set_page_config(page_title="PII Masking Tool", page_icon="🔒")

st.title("PII Masking Tool")

# Text input
text_to_mask = st.text_area("Enter text to mask PII:", height=200)

if st.button("Mask Text"):
    if text_to_mask:
        masked_text = pii_masking_pipeline(text_to_mask)

        # Display results
        st.subheader("Masked Result:")
        st.write(masked_text)

        # Only the masked output is persisted, never the raw input.
        redis_client.lpush("masked_texts", masked_text)
    else:
        st.warning("Please enter some text to mask.")


# Pagination logic
ITEMS_PER_PAGE = 10
PREVIEW_CHARS = 30  # characters of each entry shown in the expander title
total_items = redis_client.llen("masked_texts")
total_pages = math.ceil(total_items / ITEMS_PER_PAGE)

if total_items > 0:
    st.subheader("Previous Masked Texts")

    # Third column is only a spacer for the 1:3:1 layout.
    col1, col2, _ = st.columns([1, 3, 1])
    with col1:
        page = st.number_input("Page", min_value=1, max_value=max(1, total_pages), value=1)
    with col2:
        st.write(f"Page {page} of {total_pages}")

    start_idx = (page - 1) * ITEMS_PER_PAGE
    end_idx = start_idx + ITEMS_PER_PAGE - 1  # LRANGE treats the stop index as inclusive

    texts = redis_client.lrange("masked_texts", start_idx, end_idx)
    for i, txt in enumerate(texts, start=start_idx + 1):
        # Decode once and measure the decoded string: comparing the raw
        # byte length against the limit appended "..." to short multi-byte
        # (non-ASCII) entries that were never actually truncated.
        full_txt = txt.decode("utf-8")
        if len(full_txt) > PREVIEW_CHARS:
            display_txt = full_txt[:PREVIEW_CHARS] + "..."
        else:
            display_txt = full_txt
        with st.expander(f"#{i}. {display_txt}"):
            st.write(full_txt)
|
50 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
altair==5.5.0
|
2 |
+
attrs==25.1.0
|
3 |
+
blinker==1.9.0
|
4 |
+
cachetools==5.5.1
|
5 |
+
certifi==2024.12.14
|
6 |
+
charset-normalizer==3.4.1
|
7 |
+
click==8.1.8
|
8 |
+
colorama==0.4.6
|
9 |
+
coloredlogs==15.0.1
|
10 |
+
filelock==3.17.0
|
11 |
+
flatbuffers==25.1.24
|
12 |
+
fsspec==2024.12.0
|
13 |
+
gitdb==4.0.12
|
14 |
+
GitPython==3.1.44
|
15 |
+
gliner==0.2.16
|
16 |
+
hiredis==3.1.0
|
17 |
+
huggingface-hub==0.27.1
|
18 |
+
humanfriendly==10.0
|
19 |
+
idna==3.10
|
20 |
+
Jinja2==3.1.5
|
21 |
+
jsonschema==4.23.0
|
22 |
+
jsonschema-specifications==2024.10.1
|
23 |
+
markdown-it-py==3.0.0
|
24 |
+
MarkupSafe==3.0.2
|
25 |
+
mdurl==0.1.2
|
26 |
+
mpmath==1.3.0
|
27 |
+
narwhals==1.24.0
|
28 |
+
networkx==3.4.2
|
29 |
+
numpy==2.2.2
|
30 |
+
onnxruntime==1.20.1
|
31 |
+
packaging==24.2
|
32 |
+
pandas==2.2.3
|
33 |
+
pillow==11.1.0
|
34 |
+
protobuf==5.29.3
|
35 |
+
pyarrow==19.0.0
|
36 |
+
pydeck==0.9.1
|
37 |
+
Pygments==2.19.1
|
38 |
+
pyreadline3==3.5.4
|
39 |
+
python-dateutil==2.9.0.post0
|
40 |
+
pytz==2024.2
|
41 |
+
PyYAML==6.0.2
|
42 |
+
redis==5.2.1
|
43 |
+
referencing==0.36.2
|
44 |
+
regex==2024.11.6
|
45 |
+
requests==2.32.3
|
46 |
+
rich==13.9.4
|
47 |
+
rpds-py==0.22.3
|
48 |
+
safetensors==0.5.2
|
49 |
+
sentencepiece==0.2.0
|
50 |
+
setuptools==75.8.0
|
51 |
+
six==1.17.0
|
52 |
+
smmap==5.0.2
|
53 |
+
streamlit==1.41.1
|
54 |
+
sympy==1.13.1
|
55 |
+
tenacity==9.0.0
|
56 |
+
tokenizers==0.21.0
|
57 |
+
toml==0.10.2
|
58 |
+
torch==2.5.1
|
59 |
+
tornado==6.4.2
|
60 |
+
tqdm==4.67.1
|
61 |
+
transformers==4.48.1
|
62 |
+
typing_extensions==4.12.2
|
63 |
+
tzdata==2025.1
|
64 |
+
urllib3==2.3.0
|
65 |
+
watchdog==6.0.0
|
utils/model.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from gliner import GLiNER
|
2 |
+
|
3 |
+
# The GLiNER PII model is loaded once, at import time, and shared by every
# caller of this module.
print("Loading model and tokenizer...")

model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

print("Model and tokenizer loaded.")

# Entity label -> placeholder written into the masked text.
# Several labels intentionally share a placeholder (e.g. "email" and
# "email address"), and a few exist in both spaced and underscored spellings.
label_to_mask_map = {
    "name": "[NAME]",
    "nric": "[NRIC]",
    "phone number": "[PHONE]",
    "address": "[ADDRESS]",
    "email": "[EMAIL]",
    "person": "[PERSON]",
    "organization": "[ORGANIZATION]",
    "passport number": "[PASSPORT_NUMBER]",
    "credit card number": "[CREDIT_CARD]",
    "social security number": "[SSN]",
    "health insurance id number": "[HEALTH_INS_ID]",
    "date of birth": "[DOB]",
    "mobile phone number": "[MOBILE_PHONE]",
    "bank account number": "[BANK_ACCOUNT]",
    "medication": "[MEDICATION]",
    "cpf": "[CPF]",
    "driver's license number": "[DRIVER_LICENSE]",
    "tax identification number": "[TAX_ID]",
    "medical condition": "[MEDICAL_CONDITION]",
    "identity card number": "[IDENTITY_CARD]",
    "national id number": "[NATIONAL_ID]",
    "ip address": "[IP]",
    "email address": "[EMAIL]",
    "iban": "[IBAN]",
    "credit card expiration date": "[CREDIT_CARD_EXP]",
    "username": "[USERNAME]",
    "health insurance number": "[HEALTH_INS_NUM]",
    "registration number": "[REG_NUM]",
    "student id number": "[STUDENT_ID]",
    "insurance number": "[INSURANCE_NUM]",
    "flight number": "[FLIGHT_NUM]",
    "landline phone number": "[LANDLINE_PHONE]",
    "blood type": "[BLOOD_TYPE]",
    "cvv": "[CVV]",
    "reservation number": "[RESERVATION_NUM]",
    "digital signature": "[DIGITAL_SIGNATURE]",
    "social media handle": "[SOCIAL_MEDIA]",
    "license plate number": "[LICENSE_PLATE]",
    "cnpj": "[CNPJ]",
    "postal code": "[POSTAL_CODE]",
    "passport_number": "[PASSPORT_NUMBER]",
    "serial number": "[SERIAL_NUM]",
    "vehicle registration number": "[VEHICLE_REG_NUM]",
    "credit card brand": "[CREDIT_CARD_BRAND]",
    "fax number": "[FAX]",
    "visa number": "[VISA]",
    "insurance company": "[INSURANCE_COMPANY]",
    "identity document number": "[IDENTITY_DOCUMENT]",
    "transaction number": "[TRANSACTION_NUM]",
    "national health insurance number": "[NATIONAL_HEALTH_INS]",
    "cvc": "[CVC]",
    "birth certificate number": "[BIRTH_CERT]",
    "train ticket number": "[TRAIN_TICKET]",
    "passport expiration date": "[PASSPORT_EXP_DATE]",
    "social_security_number": "[SSN]",
}

# Every label the pipeline accepts, in the map's insertion order.
valid_labels = list(label_to_mask_map)
|
69 |
+
|
70 |
+
|
71 |
+
def mask_text(text, entities, mask_map=None):
    """
    Masks the original text by replacing entities with corresponding labels.

    Every occurrence of each entity string is replaced, in a single pass over
    the text. Longer entity strings take precedence over shorter ones that
    overlap them (e.g. "John Smith" wins over "Smith"), and placeholders that
    were just inserted are never scanned again, so one entity cannot corrupt
    another entity's replacement.

    Args:
        text (str): The original text.
        entities (list): A list of dictionaries where each dictionary contains:
            - "text": the extracted entity string.
            - "label": the label for the entity.
          Entities with an empty "text" are ignored. If the same string is
          reported with several labels, the first one listed is used.
        mask_map (dict, optional): Mapping from label to placeholder string.
            Defaults to the module-level ``label_to_mask_map``.

    Returns:
        str: The masked text.

    Raises:
        KeyError: If an entity label has no entry in the mask map.
    """
    import re  # local import: keeps this block usable without touching file-level imports

    if mask_map is None:
        mask_map = label_to_mask_map

    # entity string -> placeholder; setdefault keeps the first label seen.
    placeholders = {}
    for entity in entities:
        entity_text = entity["text"]
        if not entity_text:
            # An empty needle would match between every character.
            continue
        placeholders.setdefault(entity_text, mask_map[entity["label"]])

    if not placeholders:
        return text

    # Longest alternatives first so that, at any position, the most specific
    # entity is the one that matches.
    ordered = sorted(placeholders, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(item) for item in ordered))

    return pattern.sub(lambda match: placeholders[match.group(0)], text)
|
93 |
+
|
94 |
+
|
95 |
+
def pii_masking_pipeline(
    input_text,
    labels=("name", "nric", "phone number", "address", "email"),
):
    """
    Detect PII entities in a text and return the text with them masked.

    Args:
        input_text (str): The text to scan and mask.
        labels (iterable of str): Entity labels to look for. Each one must be
            present in ``valid_labels``.

    Returns:
        str: The input text with detected entities replaced by placeholders.

    Raises:
        ValueError: If any requested label is not a supported label.
    """
    # Anything requested but not supported makes the whole request invalid.
    unsupported = set(labels) - set(valid_labels)
    if unsupported:
        raise ValueError("Invalid labels provided.")

    detected = model.predict_entities(input_text, labels)
    return mask_text(input_text, detected)
|
utils/redis.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
import redis
|
2 |
+
|
3 |
+
# Module-level Redis client shared by the app; connects to localhost:6379.
# NOTE(review): no response decoding is configured here, so callers presumably
# receive bytes and must decode them themselves — confirm against redis-py docs.
redis_client = redis.Redis(host="localhost", port=6379)
|