from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
import fitz # PyMuPDF
from docx import Document
import re
import nltk
nltk.download('punkt')
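
# Split text into sentences so each chunk fed to the model stays within its 512-token input limit.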
def sentence_tokenize(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

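# Load the fine-tuned redaction model and its tokenizer from the Hugging Face Hub.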
model_dir_large = 'edithram23/Redaction_Personal_info_v1'
tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
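
# Redact one sentence: the seq2seq model rewrites the text with bracketed tags
# over personal information, which are then collapsed to a uniform [redacted].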
def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
    inputs = ["Mask Generation: " + text + '.']
    inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
    # Note: max_length counts tokens, so the character count of the input
    # serves only as a loose upper bound on the generated length.
    output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    predicted_title = decoded_output.strip()
    # Replace all bracketed placeholders with [redacted]
    pattern = r'\[.*?\]'
    redacted_text = re.sub(pattern, '[redacted]', predicted_title)
    return redacted_text

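# Extract plain text from a PDF upload, page by page, using PyMuPDF.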
def read_pdf(file):
    pdf_document = fitz.open(stream=file.read(), filetype="pdf")
    text = ""
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text += page.get_text()
    return text

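# Extract text from a .docx upload by joining its paragraphs.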
def read_docx(file):
    doc = Document(file)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text

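# Decode a plain-text upload as UTF-8.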
def read_txt(file):
    text = file.read().decode("utf-8")
    return text

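# Dispatch the upload to the matching reader based on its reported MIME type.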
def process_file(file):
    if file.type == "application/pdf":
        return read_pdf(file)
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return read_docx(file)
    elif file.type == "text/plain":
        return read_txt(file)
    else:
        return "Unsupported file type."

st.title("File Reader")
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])
if uploaded_file is not None:
    file_contents = process_file(uploaded_file)
    sentences = sentence_tokenize(file_contents)
    # Redact sentence by sentence so long documents stay within the model's input limit.
    processed_text = ''
    for sentence in sentences:
        processed_text += mask_generation(sentence) + '\n'
    st.text_area("File Contents", processed_text, height=400)
    st.download_button(
        label="Download Processed File",
        data=processed_text,
        file_name="processed_file.txt",
        mime="text/plain",
    )