from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
import fitz # PyMuPDF
from docx import Document
import re
import nltk
nltk.download('punkt')
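
# Split text into sentences so each chunk fed to the model stays within its 512-token input limit.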
def sentence_tokenize(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

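# Load the fine-tuned redaction model and its tokenizer from the Hugging Face Hub.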
model_dir_large = 'edithram23/Redaction_Personal_info_v1'
tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
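
# Redact one sentence: the seq2seq model rewrites the text with bracketed tags
# over personal information, which are then collapsed to a uniform [redacted].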
def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
    inputs = ["Mask Generation: " + text + '.']
    inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
    # Note: max_length counts tokens, so the character count of the input
    # serves only as a loose upper bound on the generated length.
    output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    predicted_title = decoded_output.strip()
    # Replace all bracketed placeholders with [redacted]
    pattern = r'\[.*?\]'
    redacted_text = re.sub(pattern, '[redacted]', predicted_title)
    return redacted_text

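# Extract plain text from a PDF upload, page by page, using PyMuPDF.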
def read_pdf(file):
    pdf_document = fitz.open(stream=file.read(), filetype="pdf")
    text = ""
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text += page.get_text()
    return text

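# Extract text from a .docx upload by joining its paragraphs.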
def read_docx(file):
    doc = Document(file)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text

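# Decode a plain-text upload as UTF-8.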
def read_txt(file):
    text = file.read().decode("utf-8")
    return text

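# Dispatch the upload to the matching reader based on its reported MIME type.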
def process_file(file):
    if file.type == "application/pdf":
        return read_pdf(file)
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return read_docx(file)
    elif file.type == "text/plain":
        return read_txt(file)
    else:
        return "Unsupported file type."

st.title("File Reader")
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])
if uploaded_file is not None:
    file_contents = process_file(uploaded_file)
    sentences = sentence_tokenize(file_contents)
    # Redact sentence by sentence so long documents stay within the model's input limit.
    processed_text = ''
    for sentence in sentences:
        processed_text += mask_generation(sentence) + '\n'
    st.text_area("File Contents", processed_text, height=400)
    st.download_button(
        label="Download Processed File",
        data=processed_text,
        file_name="processed_file.txt",
        mime="text/plain",
    )