edithram23 committed on
Commit 748048a
1 Parent(s): 2c79e12

Update app.py

Files changed (1)
  1. app.py +62 -21
app.py CHANGED
@@ -1,5 +1,4 @@
- from transformers import AutoTokenizer
- from transformers import AutoModelForSeq2SeqLM
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline  # tokenizer/model classes are still used below
  import streamlit as st
  import fitz  # PyMuPDF
  from docx import Document
@@ -16,6 +15,7 @@ def sentence_tokenize(text):
  model_dir_large = 'edithram23/Redaction_Personal_info_v1'
  tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
  model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
+ pipe1 = pipeline("token-classification", model="edithram23/new-bert-v2")

  # model_dir_small = 'edithram23/Redaction'
  # tokenizer_small = AutoTokenizer.from_pretrained(model_dir_small)
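
For context on what the new `pipe1` feeds into the `combine_words` helper added below: a Hugging Face token-classification pipeline returns one dict per sub-word token, with WordPiece continuations marked by a leading '##'. A minimal sketch of that shape (labels, scores, and offsets are illustrative, not actual output of edithram23/new-bert-v2):

    # pipe1("John Doe") -> roughly:
    [
        {'entity': 'NAME', 'score': 0.99, 'index': 1, 'word': 'Jo',   'start': 0, 'end': 2},
        {'entity': 'NAME', 'score': 0.98, 'index': 2, 'word': '##hn', 'start': 2, 'end': 4},
        {'entity': 'NAME', 'score': 0.97, 'index': 3, 'word': 'Doe',  'start': 5, 'end': 8},
    ]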
@@ -42,6 +42,50 @@ address_recognizer = PatternRecognizer(supported_entity="ADDRESS", patterns=[add
  analyzer.registry.add_recognizer(address_recognizer)
  analyzer.get_recognizers
  # Define a function to extract entities
+
+
+ def combine_words(entities):
+     combined_entities = []
+     current_entity = None
+
+     for entity in entities:
+         if current_entity:
+             if current_entity['end'] == entity['start']:
+                 # Combine the words without a space
+                 current_entity['word'] += entity['word'].replace('##', '')
+                 current_entity['end'] = entity['end']
+             elif current_entity['end'] + 1 == entity['start']:
+                 # Combine the words with a space
+                 current_entity['word'] += ' ' + entity['word'].replace('##', '')
+                 current_entity['end'] = entity['end']
+             else:
+                 # Add the previous combined entity to the list
+                 combined_entities.append(current_entity)
+                 # Start a new entity
+                 current_entity = entity.copy()
+                 current_entity['word'] = current_entity['word'].replace('##', '')
+         else:
+             # Initialize the first entity
+             current_entity = entity.copy()
+             current_entity['word'] = current_entity['word'].replace('##', '')
+
+     # Add the last entity
+     if current_entity:
+         combined_entities.append(current_entity)
+
+     return combined_entities
+
+ def words_red_bert(text):
+     final = []
+     sentences = sentence_tokenize(text)
+     for sentence in sentences:
+         x = [pipe1(sentence)]
+         m = combine_words(x[0])
+         for j in m:
+             if j['entity'] != 'none' and len(j['word']) > 1 and j['word'] != ', ':
+                 final.append(j['word'])
+     return final
+
  def extract_entities(text):
      entities = {
          "NAME": [],
@@ -132,25 +176,22 @@ if uploaded_file is not None:
      if pdf_document:
          redacted_text = []
          for pg in pdf_document:
-             text = pg.get_text('text')
-             sentences = sentence_tokenize(text)
-             for sent in sentences:
-                 entities, words_out = extract_entities(sent)
-                 avai_red = pg.search_for(sent)
-                 new = []
-                 for w in words_out:
-
-                     new += w.split('\n')
-                 words_out = [i for i in new if len(i) > 2]
-                 print(words_out)
-                 for i in avai_red:
-                     b = pg.get_text("text", clip=i)
-                     # result = [item for item in output if item in b]  # Get elements of 'a' that are in 'b'
-                     for j in words_out:
-                         new_n = pg.search_for(j, clip=i)
-                         for all in new_n:
-                             pg.add_redact_annot(all, fill=(0, 0, 0))
-             pg.apply_redactions()
+             text = pg.get_text('text')
+             sentences = sentence_tokenize(text)
+             for sent in sentences:
+                 entities, words_out = extract_entities(sent)
+                 bert_words = words_red_bert(sent)
+                 words_out += bert_words  # merge BERT hits before the newline split below
+                 new = []
+                 for w in words_out:
+                     new += w.split('\n')
+                 words_out = [i for i in new if len(i) > 2]
+
+                 # print(words_out)
+                 words_out = sorted(words_out, key=len, reverse=True)
+                 print(words_out)
+                 for i in words_out:
+                     redact_text(pg, i)

          output_pdf = "output_redacted.pdf"
          pdf_document.save(output_pdf)
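
`redact_text` is called but not defined in this diff; it presumably lives earlier in app.py. Judging from the PyMuPDF calls in the removed block (search_for, add_redact_annot, apply_redactions), a minimal sketch of such a helper could look like this (hypothetical, not the repo's actual implementation):

    def redact_text(page, word):
        # Black out every occurrence of `word` on the page.
        for rect in page.search_for(word):
            page.add_redact_annot(rect, fill=(0, 0, 0))
        page.apply_redactions()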
 