import re

import fitz  # PyMuPDF
import nltk
import streamlit as st
from docx import Document
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer

# Fetch the 'punkt' resource used by NLTK's sentence tokenizer.
nltk.download('punkt')


def sentence_tokenize(text):
    """Return the list of sentences that ``nltk.sent_tokenize`` finds in *text*."""
    return nltk.sent_tokenize(text)


# Checkpoint used for redaction. The tokenizer and model are loaded at module
# import and also serve as the default arguments of ``mask_generation``.
model_dir_large = 'edithram23/Redaction_Personal_info_v1'
tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)

# Non-greedy match of any ``[...]`` span in the model output.
_MASK_PATTERN = re.compile(r'\[.*?\]')


def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
    """Run the redaction model on *text* and return the redacted result.

    The prompt sent to the model is ``"Mask Generation: " + text + "."``,
    truncated to 512 tokens. Every bracketed span in the decoded output is
    then replaced by the literal ``[redacted]``.

    Args:
        text: The text (one sentence in this app) to redact.
        model: Seq2seq model used for generation; defaults to ``model_large``.
        tokenizer: Tokenizer paired with *model*; defaults to
            ``tokenizer_large``.

    Returns:
        The stripped model output with each ``[...]`` span replaced by
        ``[redacted]``. Blank or whitespace-only input yields ``''`` without
        calling the model.

    Note:
        Generation runs with ``do_sample=True``, so the output may differ
        between calls on the same input. Any bracketed span in the output is
        replaced, including brackets copied over from the original text.
    """
    # Nothing to redact; also avoids asking the model for a 0/1-token output.
    if not text.strip():
        return ''

    prompt = ["Mask Generation: " + text + '.']
    encoded = tokenizer(
        prompt, max_length=512, truncation=True, return_tensors="pt"
    )

    # ``len(text)`` is a character count, whereas ``max_length`` is a token
    # budget. For very short sentences the character count can be smaller than
    # the tokens needed to reproduce the sentence, which truncates the output,
    # so never go below the number of prompt tokens.
    prompt_tokens = encoded["input_ids"].shape[1]
    output_budget = max(len(text), prompt_tokens)

    generated = model.generate(
        **encoded, num_beams=8, do_sample=True, max_length=output_budget
    )
    decoded_output = tokenizer.batch_decode(
        generated, skip_special_tokens=True
    )[0]

    # Replace all occurrences of the pattern with [redacted]
    return _MASK_PATTERN.sub('[redacted]', decoded_output.strip())


def read_pdf(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: A binary file-like object whose ``read()`` returns the PDF bytes.

    Returns:
        The page texts joined in page order, with no extra separator.
    """
    # The context manager closes the document even if extraction fails.
    with fitz.open(stream=file.read(), filetype="pdf") as pdf_document:
        return "".join(page.get_text() for page in pdf_document)


def read_docx(file):
    """Return the paragraphs of a .docx document joined by newlines.

    Args:
        file: A path or file-like object accepted by ``docx.Document``.
    """
    doc = Document(file)
    return "\n".join(para.text for para in doc.paragraphs)


def read_txt(file):
    """Return the contents of a binary file-like object decoded as UTF-8.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    return file.read().decode("utf-8")


def process_file(file):
    """Extract text from an uploaded file based on its MIME type.

    Args:
        file: An uploaded-file object exposing ``type`` (the MIME type) and
            ``read()``.

    Returns:
        The extracted text for PDF, DOCX and plain-text uploads. For any other
        MIME type the literal string ``"Unsupported file type."`` is returned
        (no exception is raised), so callers receive it as if it were content.
    """
    if file.type == "application/pdf":
        return read_pdf(file)
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return read_docx(file)
    elif file.type == "text/plain":
        return read_txt(file)
    else:
        return "Unsupported file type."
# Streamlit page: upload a document, redact it sentence by sentence, then show
# the result and offer it as a plain-text download.
st.title("File Reader")
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])

if uploaded_file is not None:
    file_contents = process_file(uploaded_file)
    sentences = sentence_tokenize(file_contents)

    # Each sentence is redacted independently and written on its own line,
    # with a newline after every sentence including the last one.
    processed_text = ''.join(
        mask_generation(sentence) + '\n' for sentence in sentences
    )

    st.text_area("File Contents", processed_text, height=400)
    st.download_button(
        label="Download Processed File",
        data=processed_text,
        file_name="processed_file.txt",
        mime="text/plain",
    )