edithram23 committed
Commit
adbaf3e
1 Parent(s): 7965ec3

PDF-download

Files changed (1)
  1. app.py +122 -34
app.py CHANGED
@@ -19,33 +19,83 @@ model_dir_small = 'edithram23/Redaction'
 tokenizer_small = AutoTokenizer.from_pretrained(model_dir_small)
 model_small = AutoModelForSeq2SeqLM.from_pretrained(model_dir_small)
 
-def small(text,model=model_small,tokenizer=tokenizer_small):
-    inputs = ["Mask Generation: " + text.lower()+'.']
+def small(text, model=model_small, tokenizer=tokenizer_small):
+    inputs = ["Mask Generation: " + text.lower() + '.']
     inputs = tokenizer(inputs, max_length=256, truncation=True, return_tensors="pt")
     output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
     decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
     predicted_title = decoded_output.strip()
     pattern = r'\[.*?\]'
-    # Replace all occurrences of the pattern with [redacted]
     redacted_text = re.sub(pattern, '[redacted]', predicted_title)
     return redacted_text
 
-
-def mask_generation(text,model=model_large,tokenizer=tokenizer_large):
-    if(len(text)<200):
-        text = text+'.'
+def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
+    if len(text) < 200:
+        text = text + '.'
         return small(text)
-    inputs = ["Mask Generation: " + text.lower()+'.']
+    inputs = ["Mask Generation: " + text.lower() + '.']
     inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
     output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
     decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
     predicted_title = decoded_output.strip()
     pattern = r'\[.*?\]'
-    # Replace all occurrences of the pattern with [redacted]
     redacted_text = re.sub(pattern, '[redacted]', predicted_title)
     return redacted_text
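Note on the two helpers above: `generate`'s `max_length` is measured in tokens while `len(text)` counts characters, so the cap is loose rather than exact, and `mask_generation` falls through to `small` for inputs under 200 characters. A minimal usage sketch, assuming the models rewrite PII as bracketed spans (the sample output is illustrative, not an actual model result):

```python
sentence = "John Doe lives at 42 Baker Street."
masked = mask_generation(sentence)  # short input, so this routes to small()
print(masked)  # e.g. "[redacted] lives at [redacted]." (illustrative only)
```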
 
+def find_surrounding_words(text, target="[redacted]"):
+    pattern = re.compile(r'([A-Za-z0-9_@#\$%\^&*\(\)\[\]\{\}\.\,]+)?\s*' + re.escape(target) + r'\s*([A-Za-z0-9_@#\$%\^&*\(\)\[\]\{\}\.\,]+)?')
+    matches = pattern.finditer(text)
+    results = []
+    for match in matches:
+        before, after = match.group(1), match.group(2)
+
+        if before:
+            before_parts = before.split(',')
+            before_parts = [item for item in before_parts if item.strip()]
+            if len(before_parts) > 1:
+                before_word = before_parts[0].strip()
+                before_index = match.start(1)
+            else:
+                before_word = before_parts[0]
+                before_index = match.start(1)
+        else:
+            before_word = None
+            before_index = None
+
+        if after:
+            after_parts = after.split(',')
+            after_parts = [item for item in after_parts if item.strip()]
+            if len(after_parts) > 1:
+                after_word = after_parts[0].strip()
+                after_index = match.start(2)
+            else:
+                after_word = after_parts[0]
+                after_index = match.start(2)
+        else:
+            after_word = None
+            after_index = None
+
+        if match.start() == 0:
+            before_word = None
+            before_index = None
+
+        if match.end() == len(text):
+            after_word = None
+            after_index = None
+
+        results.append({
+            "before_word": before_word,
+            "after_word": after_word,
+            "before_index": before_index,
+            "after_index": after_index
+        })
+    return results
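For reference, the return shape of `find_surrounding_words`: one dict per `[redacted]` occurrence, carrying the neighbouring words and their character offsets, with `None` on any side that touches the start or end of the string. A hand-checked example, assuming the regex behaves as written:

```python
spans = find_surrounding_words("Please contact [redacted] tomorrow morning.")
# [{'before_word': 'contact', 'after_word': 'tomorrow',
#   'before_index': 7, 'after_index': 26}]
```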
+
+def redact_text(page, text):
+    text_instances = page.search_for(text)
+    for inst in text_instances:
+        page.add_redact_annot(inst, fill=(0, 0, 0))
+    page.apply_redactions()
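`redact_text` follows PyMuPDF's standard redaction workflow: `search_for` returns the bounding rectangles of every match on the page, `add_redact_annot` marks each rectangle, and `apply_redactions` removes the underlying text rather than just painting over it. The same pattern standalone (paths are placeholders):

```python
import fitz  # PyMuPDF

doc = fitz.open("input.pdf")  # placeholder path
for page in doc:
    for rect in page.search_for("Jane Doe"):  # one Rect per occurrence
        page.add_redact_annot(rect, fill=(0, 0, 0))
    page.apply_redactions()  # deletes the text under the black boxes
doc.save("input_redacted.pdf")
```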
 
 def read_pdf(file):
     pdf_document = fitz.open(stream=file.read(), filetype="pdf")
@@ -53,7 +103,7 @@ def read_pdf(file):
     for page_num in range(len(pdf_document)):
         page = pdf_document.load_page(page_num)
         text += page.get_text()
-    return text
+    return text, pdf_document
 
 def read_docx(file):
     doc = Document(file)
@@ -68,33 +118,71 @@ def process_file(file):
     if file.type == "application/pdf":
         return read_pdf(file)
     elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-        return read_docx(file)
+        return read_docx(file), None
     elif file.type == "text/plain":
-        return read_txt(file)
+        return read_txt(file), None
     else:
-        return "Unsupported file type."
+        return "Unsupported file type.", None
 
 st.title("Redaction")
-# user = st.text_input("Input Text to Redact")
 uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])
-# if(user != ''):
-# token = sentence_tokenize(user)
-# final=''
-# for i in range(0, len(token)):
-# final+=mask_generation(token[i])+'\n'
-# st.text_area("OUTPUT",final,height=400)
+
 if uploaded_file is not None:
-    file_contents = process_file(uploaded_file)
-    token = sentence_tokenize(file_contents)
-    final=''
-    for i in range(0, len(token)):
-        final+=mask_generation(token[i])+'\n'
-    processed_text = final
-    st.text_area("OUTPUT", processed_text, height=400)
-
-    st.download_button(
-        label="Download Processed File",
-        data=processed_text,
-        file_name="processed_file.txt",
-        mime="text/plain",
-    )
+    file_contents, pdf_document = process_file(uploaded_file)
+    if pdf_document:
+        redacted_text = []
+        for page in pdf_document:
+            pg = page.get_text()
+            pg_lower = pg.lower()
+            token = sentence_tokenize(pg)
+            final = ''
+            for t in token:
+                t_lower = t.lower()
+                final = mask_generation(t)
+                words = find_surrounding_words(final)
+                for i in range(len(words)):
+                    if words[i]['after_index'] is None:
+                        if words[i]['before_word'] in t_lower:
+                            fi = t_lower.index(words[i]['before_word'])
+                            fi = fi + len(words[i]['before_word'])
+                            li = len(t)
+                            redacted_text.append(t[fi:li])
+                    elif words[i]['before_index'] is None:
+                        if words[i]['after_word'] in t_lower:
+                            fi = 0
+                            li = t_lower.index(words[i]['after_word'])
+                            redacted_text.append(t[fi:li])
+                    else:
+                        if words[i]['after_word'] in t_lower and words[i]['before_word'] in t_lower:
+                            before_word = words[i]['before_word']
+                            after_word = words[i]['after_word']
+                            fi = t_lower.index(before_word)
+                            fi = fi + len(before_word)
+                            li = t_lower.index(after_word)
+                            redacted_text.append(t[fi:li])
+        for page in pdf_document:
+            for i in redacted_text:
+                redact_text(page, i)
+        output_pdf = "output_redacted.pdf"
+        pdf_document.save(output_pdf)
+
+        with open(output_pdf, "rb") as file:
+            st.download_button(
+                label="Download Processed PDF",
+                data=file,
+                file_name="processed_file.pdf",
+                mime="application/pdf",
+            )
+    else:
+        token = sentence_tokenize(file_contents)
+        final = ''
+        for i in range(0, len(token)):
+            final += mask_generation(token[i]) + '\n'
+        processed_text = final
+        st.text_area("OUTPUT", processed_text, height=400)
+        st.download_button(
+            label="Download Processed File",
+            data=processed_text,
+            file_name="processed_file.txt",
+            mime="text/plain",
+        )
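Taken together, the new PDF branch is a two-pass pipeline: pass one runs the model sentence by sentence and uses the anchor words around each `[redacted]` to slice the original-case span out of the source sentence (matching against `t_lower` because the model input was lowercased); pass two redacts those spans on every page. One caveat: `str.index` returns the first occurrence, so a repeated anchor word can select the wrong span. A condensed sketch of the flow, with `collect_sensitive_spans` as a hypothetical name standing in for pass one:

```python
doc = fitz.open("sample.pdf")         # placeholder input
spans = collect_sensitive_spans(doc)  # hypothetical helper: model + anchor-word recovery
for page in doc:                      # pass two: redact every recovered span
    for span in spans:
        redact_text(page, span)
doc.save("sample_redacted.pdf")
```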