from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
import fitz  # PyMuPDF
from docx import Document
import re
import nltk

# Fetch the Punkt sentence-tokenizer data that nltk.sent_tokenize relies on.
nltk.download('punkt')


def sentence_tokenize(text):
    """Split raw text into sentences so each model call stays short."""
    return nltk.sent_tokenize(text)


model_dir_large = 'edithram23/Redaction_Personal_info_v1'
tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
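
# Streamlit re-runs this whole script on every interaction, so the model above
# is reloaded each time. A cached loader avoids that; a minimal sketch,
# assuming a Streamlit version that provides st.cache_resource (>= 1.18):
#
# @st.cache_resource
# def load_redaction_model(model_dir):
#     return (AutoTokenizer.from_pretrained(model_dir),
#             AutoModelForSeq2SeqLM.from_pretrained(model_dir))
#
# tokenizer_large, model_large = load_redaction_model(model_dir_large)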


def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
    # Pad very short inputs; the model has little context to work with otherwise.
    if len(text) < 30:
        text = text + '.'
    inputs = ["Mask Generation: " + text.lower() + '.']
    inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
    # The character count of the input serves as a rough upper bound on the
    # number of tokens to generate.
    output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    predicted_title = decoded_output.strip()
    # Replace every bracketed tag the model emits with a uniform [redacted] marker.
    pattern = r'\[.*?\]'
    redacted_text = re.sub(pattern, '[redacted]', predicted_title)
    return redacted_text
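
# Illustrative call (hypothetical input; the exact output varies because
# do_sample=True makes generation stochastic):
#   mask_generation("My name is John Doe and I live at 21 Baker Street.")
#   # -> roughly "my name is [redacted] and i live at [redacted]."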


def read_pdf(file):
    # fitz can open the uploaded bytes directly as an in-memory stream.
    pdf_document = fitz.open(stream=file.read(), filetype="pdf")
    text = ""
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text += page.get_text()
    return text


def read_docx(file):
    doc = Document(file)
    return "\n".join(para.text for para in doc.paragraphs)


def read_txt(file):
    return file.read().decode("utf-8")
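
# read_txt assumes the upload is UTF-8. A more forgiving variant could pass
# errors="replace" to decode() so other encodings do not raise an exception.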


def process_file(file):
    # Dispatch on the MIME type Streamlit attaches to the uploaded file.
    if file.type == "application/pdf":
        return read_pdf(file)
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return read_docx(file)
    elif file.type == "text/plain":
        return read_txt(file)
    else:
        return "Unsupported file type."


st.title("File Reader")
user = st.text_input("Input Text to Redact")
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])

if user != '':
    # Redact sentence by sentence so each model call stays well under the
    # 512-token input cap.
    sentences = sentence_tokenize(user)
    final = ''
    for sentence in sentences:
        final += mask_generation(sentence) + '\n'
    st.text_area("OUTPUT", final, height=400)

if uploaded_file is not None:
    file_contents = process_file(uploaded_file)
    sentences = sentence_tokenize(file_contents)
    final = ''
    for sentence in sentences:
        final += mask_generation(sentence) + '\n'
    processed_text = final
    st.text_area("OUTPUT", processed_text, height=400)
    st.download_button(
        label="Download Processed File",
        data=processed_text,
        file_name="processed_file.txt",
        mime="text/plain",
    )