from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
import fitz  # PyMuPDF
from docx import Document
import re
import nltk

# Fetch the sentence-tokenizer data used by nltk.sent_tokenize.
nltk.download('punkt')

def sentence_tokenize(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

# Load the fine-tuned seq2seq redaction model and its tokenizer.
model_dir_large = 'edithram23/Redaction_Personal_info_v1'
tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
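# Note (optional optimisation, not in the original app): Streamlit re-executes this
# module on every interaction, so the model is reloaded each time. Wrapping the three
# loading lines above in a function decorated with @st.cache_resource would load the
# model once per session instead.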

def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
    # Append a period to very short inputs so the model sees a complete sentence.
    if len(text) < 30:
        text = text + '.'
    # "Mask Generation: " is the task prefix the model expects.
    inputs = ["Mask Generation: " + text.lower() + '.']
    inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
    # Cap generation length to roughly the length of the input.
    output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    predicted_title = decoded_output.strip()
    # Replace every bracketed entity tag the model emits with a uniform [redacted] marker.
    pattern = r'\[.*?\]'
    redacted_text = re.sub(pattern, '[redacted]', predicted_title)
    return redacted_text

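# Illustrative example (hypothetical output; the exact masking depends on the model):
#   mask_generation("My name is John Smith and I live in Chicago.")
#   -> "my name is [redacted] and i live in [redacted]."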

def read_pdf(file):
    pdf_document = fitz.open(stream=file.read(), filetype="pdf")
    text = ""
    for page in pdf_document:
        text += page.get_text()
    return text

def read_docx(file):
    doc = Document(file)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text

def read_txt(file):
    text = file.read().decode("utf-8")
    return text

def process_file(file):
    # Dispatch on the uploaded file's MIME type.
    if file.type == "application/pdf":
        return read_pdf(file)
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return read_docx(file)
    elif file.type == "text/plain":
        return read_txt(file)
    else:
        return "Unsupported file type."

st.title("File Reader")

user = st.text_input("Input Text to Redact")
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])

if user != '':
    # Redact pasted text sentence by sentence.
    sentences = sentence_tokenize(user)
    final = ''
    for sentence in sentences:
        final += mask_generation(sentence) + '\n'
    st.text_area("OUTPUT", final, height=400)

if uploaded_file is not None:
    # Extract text from the uploaded file, then redact it sentence by sentence.
    file_contents = process_file(uploaded_file)
    sentences = sentence_tokenize(file_contents)
    final = ''
    for sentence in sentences:
        final += mask_generation(sentence) + '\n'
    processed_text = final
    st.text_area("OUTPUT", processed_text, height=400)
    st.download_button(
        label="Download Processed File",
        data=processed_text,
        file_name="processed_file.txt",
        mime="text/plain",
    )
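
# To run the app locally (assuming the dependencies imported above are installed):
#   streamlit run app.py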