Spaces:

vedantM
/

PII-Data-Detection

Running

App Files Files Community

Vedant Mahangade committed on May 3, 2024

Commit

c6e6478

1 Parent(s): 6a804e1

added model, processing and text highlighting

Browse files

Files changed (1) hide show

app.py +40 -2

app.py CHANGED Viewed

@@ -1,4 +1,42 @@
 import streamlit as st
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)

 import streamlit as st
+import spacy
+from spacy.tokens import Span
+from transformers import pipeline, BigBirdTokenizerFast, AutoModelForTokenClassification
+# from transformers import PreTrainedTokenizerFast
+@st.cache_resource
+def _load_classifier():
+    """Build the BigBird token-classification pipeline once and reuse it.
+
+    Streamlit re-executes the script on every interaction; caching the
+    pipeline as a resource avoids re-creating the tokenizer and model on
+    each rerun.
+    """
+    # NOTE(review): block_size is kept exactly as in the original call; confirm
+    # that the tokenizer actually consumes this keyword.
+    tokenizer = BigBirdTokenizerFast.from_pretrained("google/bigbird-roberta-base", block_size=2)
+    model = AutoModelForTokenClassification.from_pretrained("vedantM/BigBird-PII")
+    return pipeline(task="token-classification",
+                    model=model,
+                    aggregation_strategy="average",
+                    tokenizer=tokenizer)
+
+
+def pii_app():
+    """Render the PII detection page.
+
+    Reads a paragraph from a text area, runs the token-classification
+    pipeline over it, lists every detected entity with its type, and shows
+    the input text with the detected spans highlighted.
+    """
+    st.title('PII Data Detection')
+    text_input = st.text_area('Enter a Paragraph below to get list of PII in your text.')
+
+    if text_input.strip():
+        output = _load_classifier()(text_input)
+    else:
+        # Nothing typed yet: skip inference and show empty results.
+        output = []
+
+    st.header('List of Entities:')
+    for entity in output:
+        st.write(f"Entity: {entity['word']}, Type: {entity['entity_group']}")
+
+    highlighted_text = highlight_pii(text_input, output)
+    st.header('\nPII Detected Output:')
+    # highlight_pii returns HTML markup, so raw HTML rendering must be enabled.
+    st.markdown(highlighted_text, unsafe_allow_html=True)
+def highlight_pii(text, entities):
+    """Return *text* as HTML with every detected entity wrapped in a highlight span.
+
+    Args:
+        text: The raw input string the entities were detected in.
+        entities: Iterable of mappings with integer ``"start"`` and ``"end"``
+            character offsets into *text* (end exclusive).
+
+    Returns:
+        A string in which each entity span is wrapped in
+        ``<span style="background-color: blue">...</span>``. All text taken
+        from *text* is HTML-escaped, because the caller renders the result
+        as raw HTML.
+    """
+    # Local import keeps this block self-contained; html is standard library.
+    from html import escape
+
+    open_tag = '<span style="background-color: blue">'
+    close_tag = '</span>'
+
+    pieces = []
+    cursor = 0  # index in the original text up to which output has been emitted
+    # Process spans left to right so the result does not depend on input order.
+    for entity in sorted(entities, key=lambda item: item["start"]):
+        start_idx, end_idx = entity["start"], entity["end"]
+        if start_idx < cursor:
+            # Overlaps a span that is already highlighted; nesting would
+            # duplicate text, so the later span is skipped.
+            continue
+        pieces.append(escape(text[cursor:start_idx], quote=False))
+        pieces.append(open_tag + escape(text[start_idx:end_idx], quote=False) + close_tag)
+        cursor = end_idx
+    pieces.append(escape(text[cursor:], quote=False))
+    return "".join(pieces)
+# Script entry point: render the PII detection page when this file is executed directly.
+if __name__ == "__main__":
+    pii_app()