Vedant Mahangade committed on
Commit
c6e6478
·
1 Parent(s): 6a804e1

added model, processing and text highlighting

Browse files
Files changed (1) hide show
  1. app.py +40 -2
app.py CHANGED
@@ -1,4 +1,42 @@
1
  import streamlit as st
 
 
 
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import spacy
3
+ from spacy.tokens import Span
4
+ from transformers import pipeline, BigBirdTokenizerFast, AutoModelForTokenClassification
5
+ # from transformers import PreTrainedTokenizerFast
6
 
7
+
8
@st.cache_resource
def _load_pii_classifier():
    """Build the BigBird PII token-classification pipeline.

    Wrapped in ``st.cache_resource`` so the tokenizer and model weights are
    loaded once and reused, instead of being re-created every time the
    Streamlit script is re-executed.
    """
    tokenizer = BigBirdTokenizerFast.from_pretrained(
        "google/bigbird-roberta-base", block_size=2
    )
    model = AutoModelForTokenClassification.from_pretrained("vedantM/BigBird-PII")
    return pipeline(
        task="token-classification",
        model=model,
        aggregation_strategy="average",
        tokenizer=tokenizer,
    )


def pii_app():
    """Render the PII detection page.

    Shows a text area, runs the token-classification pipeline on the entered
    paragraph, lists every detected entity with its group label, and then
    displays the paragraph with the detected spans highlighted.

    Nothing beyond the title and the input box is rendered while the input
    is blank, so the model is not invoked on an empty string.
    """
    st.title('PII Data Detection')
    text_input = st.text_area('Enter a Paragraph below to get list of PII in your text.')

    # The text area is empty on first load; skip inference until there is
    # something to analyse.
    if not text_input or not text_input.strip():
        return

    big_bird_classifier = _load_pii_classifier()
    output = big_bird_classifier(text_input)

    st.header('List of Entities:')
    for entity in output:
        st.write(f"Entity: {entity['word']}, Type: {entity['entity_group']}")

    highlighted_text = highlight_pii(text_input, output)
    st.header('\nPII Detected Output:')
    # highlight_pii returns HTML markup, hence unsafe_allow_html.
    st.markdown(highlighted_text, unsafe_allow_html=True)
25
+
26
+
27
def highlight_pii(text, entities):
    """Return ``text`` as HTML with each detected entity wrapped in a blue span.

    Args:
        text: The original paragraph the entities were detected in.
        entities: Iterable of mappings, each with integer ``"start"`` and
            ``"end"`` character offsets into ``text`` (end exclusive).

    Returns:
        An HTML string. Every piece of ``text`` is HTML-escaped (``&``, ``<``,
        ``>``) because the caller renders the result with
        ``unsafe_allow_html=True``; only the highlight ``<span>`` tags are
        emitted as real markup.

    Entities are processed in order of their start offset. An entity that
    begins inside a span that has already been emitted is skipped, so
    overlapping detections cannot produce broken or nested markup.
    """
    import html

    open_tag = '<span style="background-color: blue">'
    close_tag = '</span>'

    pieces = []
    cursor = 0  # index in `text` up to which output has been emitted
    for entity in sorted(entities, key=lambda item: item["start"]):
        start_idx = entity["start"]
        end_idx = entity["end"]
        if start_idx < cursor:
            # Overlaps the previously highlighted span; skip it.
            continue
        pieces.append(html.escape(text[cursor:start_idx], quote=False))
        pieces.append(
            open_tag + html.escape(text[start_idx:end_idx], quote=False) + close_tag
        )
        cursor = end_idx
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)
40
+
41
# Script entry point: render the PII detection page when this file is executed
# as the main module.
if __name__ == "__main__":
    pii_app()