edithram23 committed on
Commit
40bbfdf
·
verified ·
1 Parent(s): 3d15ff1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -95
app.py CHANGED
@@ -1,7 +1,8 @@
1
- from transformers import AutoTokenizer
2
- from transformers import AutoModelForSeq2SeqLM
3
  import streamlit as st
4
  import fitz # PyMuPDF
 
 
5
  from docx import Document
6
  import re
7
  import nltk
@@ -11,24 +12,11 @@ def sentence_tokenize(text):
11
  sentences = nltk.sent_tokenize(text)
12
  return sentences
13
 
 
14
  model_dir_large = 'edithram23/Redaction_Personal_info_v1'
15
  tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
16
  model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
17
 
18
- # model_dir_small = 'edithram23/Redaction'
19
- # tokenizer_small = AutoTokenizer.from_pretrained(model_dir_small)
20
- # model_small = AutoModelForSeq2SeqLM.from_pretrained(model_dir_small)
21
-
22
- # def small(text, model=model_small, tokenizer=tokenizer_small):
23
- # inputs = ["Mask Generation: " + text.lower() + '.']
24
- # inputs = tokenizer(inputs, max_length=256, truncation=True, return_tensors="pt")
25
- # output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
26
- # decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
27
- # predicted_title = decoded_output.strip()
28
- # pattern = r'\[.*?\]'
29
- # redacted_text = re.sub(pattern, '[redacted]', predicted_title)
30
- # return redacted_text
31
-
32
  def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
33
  if len(text) < 90:
34
  text = text + '.'
@@ -42,54 +30,7 @@ def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
42
  redacted_text = re.sub(pattern, '[redacted]', predicted_title)
43
  return redacted_text
44
 
45
- def find_surrounding_words(text, target="[redacted]"):
46
- pattern = re.compile(r'([A-Za-z0-9_@#\$%\^&*\(\)\[\]\{\}\.\,]+)?\s*' + re.escape(target) + r'\s*([A-Za-z0-9_@#\$%\^&*\(\)\[\]\{\}\.\,]+)?')
47
- matches = pattern.finditer(text)
48
- results = []
49
- for match in matches:
50
- before, after = match.group(1), match.group(2)
51
-
52
- if before:
53
- before_parts = before.split(',')
54
- before_parts = [item for item in before_parts if item.strip()]
55
- if len(before_parts) > 1:
56
- before_word = before_parts[0].strip()
57
- before_index = match.start(1)
58
- else:
59
- before_word = before_parts[0]
60
- before_index = match.start(1)
61
- else:
62
- before_word = None
63
- before_index = None
64
-
65
- if after:
66
- after_parts = after.split(',')
67
- after_parts = [item for item in after_parts if item.strip()]
68
- if len(after_parts) > 1:
69
- after_word = after_parts[0].strip()
70
- after_index = match.start(2)
71
- else:
72
- after_word = after_parts[0]
73
- after_index = match.start(2)
74
- else:
75
- after_word = None
76
- after_index = None
77
-
78
- if match.start() == 0:
79
- before_word = None
80
- before_index = None
81
-
82
- if match.end() == len(text):
83
- after_word = None
84
- after_index = None
85
-
86
- results.append({
87
- "before_word": before_word,
88
- "after_word": after_word,
89
- "before_index": before_index,
90
- "after_index": after_index
91
- })
92
- return results
93
 
94
  def redact_text(page, text):
95
  text_instances = page.search_for(text)
@@ -132,37 +73,17 @@ if uploaded_file is not None:
132
  if pdf_document:
133
  redacted_text = []
134
  for page in pdf_document:
135
- pg = page.get_text()
136
- pg_lower = pg.lower()
137
- token = sentence_tokenize(pg)
138
- final = ''
139
- for t in token:
140
- t_lower = t.lower()
141
- final = mask_generation(t)
142
- words = find_surrounding_words(final)
143
- for i in range(len(words)):
144
- if words[i]['after_index'] is None:
145
- if words[i]['before_word'] in t_lower:
146
- fi = t_lower.index(words[i]['before_word'])
147
- fi = fi + len(words[i]['before_word'])
148
- li = len(t)
149
- redacted_text.append(t[fi:li])
150
- elif words[i]['before_index'] is None:
151
- if words[i]['after_word'] in t_lower:
152
- fi = 0
153
- li = t_lower.index(words[i]['after_word'])
154
- redacted_text.append(t[fi:li])
155
- else:
156
- if words[i]['after_word'] in t_lower and words[i]['before_word'] in t_lower:
157
- before_word = words[i]['before_word']
158
- after_word = words[i]['after_word']
159
- fi = t_lower.index(before_word)
160
- fi = fi + len(before_word)
161
- li = t_lower.index(after_word)
162
- redacted_text.append(t[fi:li])
163
- for page in pdf_document:
164
- for i in redacted_text:
165
- redact_text(page, i)
166
  output_pdf = "output_redacted.pdf"
167
  pdf_document.save(output_pdf)
168
 
 
1
+ from transformers import pipeline
 
2
  import streamlit as st
3
  import fitz # PyMuPDF
4
+ from transformers import AutoTokenizer
5
+ from transformers import AutoModelForSeq2SeqLM
6
  from docx import Document
7
  import re
8
  import nltk
 
12
  sentences = nltk.sent_tokenize(text)
13
  return sentences
14
 
15
+ # Use a pipeline as a high-level helper
16
  model_dir_large = 'edithram23/Redaction_Personal_info_v1'
17
  tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
18
  model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
21
  if len(text) < 90:
22
  text = text + '.'
 
30
  redacted_text = re.sub(pattern, '[redacted]', predicted_title)
31
  return redacted_text
32
 
33
+ pipe1 = pipeline("token-classification", model="edithram23/new-bert-v2")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def redact_text(page, text):
36
  text_instances = page.search_for(text)
 
73
  if pdf_document:
74
  redacted_text = []
75
  for page in pdf_document:
76
+ final=[]
77
+ text = pg.get_text()
78
+ sentences = sentence_tokenize(text)
79
+ for sentence in sentences:
80
+ x=[pipe1(sentence)]
81
+ m = combine_words(x[0])
82
+ for j in m:
83
+ if(j['entity']!='none'):
84
+ final.append(j['word'])
85
+ for i in final:
86
+ redact_text(pg,i)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  output_pdf = "output_redacted.pdf"
88
  pdf_document.save(output_pdf)
89