Spaces:

langdonholmes
/

piilo

Sleeping

File size: 4,995 Bytes

e9abb72
5c59636
e9abb72
0c29fae
e9abb72
0c29fae
 
 
f0664d7
0c29fae
 
9637704
 
0c29fae
5c59636
e9abb72
 
 
 
 
5c59636
e9abb72
 
5c59636
 
 
e9abb72
 
9637704
e9abb72
 
 
3ad7899
9637704
e9abb72
287a33f
e9abb72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c59636
e9abb72
 
e7e2ae9
 
e9abb72
0c29fae
5c59636
e9abb72
 
 
5c59636
287a33f
 
e9abb72
 
 
5c59636
e9abb72
 
 
5c59636
e9abb72
 
0c29fae
e9abb72
 
 
 
5c59636
e9abb72
 
 
 
 
5c59636
 
e9abb72
 
 
5c59636
e9abb72
 
 
 
 
5c59636
 
e9abb72
287a33f
e9abb72
 
5c59636
e9abb72
 
 
 
 
 
e3f8caf
e9abb72
5c59636
e9abb72
5c59636
 
e9abb72
3ad7899
287a33f
 
e9abb72
287a33f
e9abb72
5c59636
e9abb72
 
 
 
 
5c59636
e9abb72
5c59636
 
 
 
e9abb72
 
 
 
 
 
5c59636
e9abb72
 
 
 
 
5c59636
e9abb72
 
5c59636
e9abb72


'''Streamlit app for Student Name Detection models.'''
import json
import os
import warnings
from json import JSONEncoder

import pandas as pd
import streamlit as st
from annotated_text import annotated_text

from piilo.engines.analyzer import CustomAnalyzer
from piilo.engines.anonymizer import SurrogateAnonymizer

os.environ['TOKENIZERS_PARALLELISM'] = 'false'
warnings.filterwarnings('ignore')

# Helper methods
@st.cache(allow_output_mutation=True)
def analyzer_engine():
    '''Return AnalyzerEngine and cache with Streamlit.'''

    configuration = {
        'nlp_engine_name': 'spacy',
        'models': [
            {'lang_code': 'en', 'model_name': 'en_student_name_detector'}],
    }

    return CustomAnalyzer(configuration=configuration)

@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    '''Return generate surrogate anonymizer.'''
    return SurrogateAnonymizer()

def annotate(text, st_analyze_results, st_entities):
    tokens = []
    # sort by start index
    results = sorted(st_analyze_results, key=lambda x: x.start)
    for i, res in enumerate(results):
        if i == 0:
            tokens.append(text[:res.start])

        # append entity text and entity type
        tokens.append((text[res.start: res.end], res.entity_type))

        # if another entity coming i.e. we're not at the last results element, add text up to next entity
        if i != len(results) - 1:
            tokens.append(text[res.end:results[i+1].start])
        # if no more entities coming, add all remaining text
        else:
            tokens.append(text[res.end:])
    return tokens


st.set_page_config(page_title='Student Name Detector (English)', layout='wide')

# Side bar
st.sidebar.image('logo.png')

st.sidebar.markdown(
    '''Detect and anonymize PII in text using an [NLP model](https://huggingface.co/langdonholmes/en_student_name_detector) [trained](https://github.com/aialoe/deidentification-pipeline) on student-generated text collected from a massive online open-enrollment course.
'''
)

st_entities = st.sidebar.multiselect(
    label='Which entities to look for?',
    options=analyzer_engine().get_supported_entities(),
    default=list(analyzer_engine().get_supported_entities()),
)

st_threshold = st.sidebar.slider(
    label='Acceptance threshold', min_value=0.0, max_value=1.0, value=0.35
)

st_return_decision_process = st.sidebar.checkbox(
    'Add analysis explanations in json')

st.sidebar.info(
    'This is part of a project to develop new anonymization systems that are appropriate for student-generated text.'
)

# Main panel
analyzer_load_state = st.info(
    'Starting Presidio analyzer and loading Longformer-based model...')
engine = analyzer_engine()
analyzer_load_state.empty()


st_text = st.text_area(
    label='Type in some text',
    value='Learning Reflection\n\nWritten by John Williams and Samantha Morales\n\nIn this course I learned many things. As Liedtke (2004) said, \"Students grow when they learn\" (Erickson et al. 1998).\n\nBy John H. Williams -- (714) 328-9989 -- johnwilliams@yahoo.com',
    height=200,
)

button = st.button('Detect PII')

if 'first_load' not in st.session_state:
    st.session_state['first_load'] = True

# After
st.subheader('Analyzed')
with st.spinner('Analyzing...'):
    if button or st.session_state.first_load:
        st_analyze_results = analyzer_engine().analyze(
            text=st_text,
            entities=st_entities,
            language='en',
            score_threshold=st_threshold,
            return_decision_process=st_return_decision_process,
        )
        annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
        # annotated_tokens
        annotated_text(*annotated_tokens)

# vertical space
st.text('')

st.subheader('Anonymized')
with st.spinner('Anonymizing...'):
    if button or st.session_state.first_load:
        st_anonymize_results = anonymizer_engine().anonymize(
                                         st_text,
                                         st_analyze_results)
        st_anonymize_results
        
# table result
st.subheader('Detailed Findings')
if st_analyze_results:
    res_dicts = [r.to_dict() for r in st_analyze_results]
    for d in res_dicts:
        d['Value'] = st_text[d['start']:d['end']]
    df = pd.DataFrame.from_records(res_dicts)
    df = df[['entity_type', 'Value', 'score', 'start', 'end']].rename(
        {
            'entity_type': 'Entity type',
            'start': 'Start',
            'end': 'End',
            'score': 'Confidence',
        },
        axis=1,
    )

    st.dataframe(df, width=1000)
else:
    st.text('No findings')

st.session_state['first_load'] = True

# json result
class ToDictListEncoder(JSONEncoder):
    '''Encode dict to json.'''

    def default(self, o):
        '''Encode to JSON using to_dict.'''
        if o:
            return o.to_dict()
        return []

if st_return_decision_process:
    st.json(json.dumps(st_analyze_results, cls=ToDictListEncoder))