edithram23 committed on
Commit
7456815
1 Parent(s): 51d9845

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -0
app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer
2
+ from transformers import AutoModelForSeq2SeqLM
3
+ import streamlit as st
4
+ import fitz # PyMuPDF
5
+ from docx import Document
6
+ import re
7
+ import nltk
8
+ nltk.download('punkt')
9
+
10
def sentence_tokenize(text):
    """Split *text* into a list of sentences using ``nltk.sent_tokenize``."""
    return nltk.sent_tokenize(text)
13
+
14
# Hugging Face Hub id of the seq2seq checkpoint used for redaction.
model_dir_large = 'edithram23/Redaction_Personal_info_v1'


@st.cache_resource
def _load_redaction_model(model_dir):
    """Load the tokenizer and seq2seq model for *model_dir* once per process.

    Streamlit re-executes this script from top to bottom on every user
    interaction; caching the loaded objects avoids re-instantiating the
    model on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    return tokenizer, model


# Module-level names are kept because mask_generation uses them as defaults.
tokenizer_large, model_large = _load_redaction_model(model_dir_large)
17
+
18
# Matches any bracketed span (non-greedy) in the decoded model output.
_BRACKETED_SPAN = re.compile(r'\[.*?\]')


def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
    """Run the redaction model on one piece of text and normalise its masks.

    The text is wrapped in the ``"Mask Generation: "`` prompt (with a
    trailing ``'.'`` appended), passed through ``model.generate``, decoded,
    and every bracketed span in the decoded output is replaced with the
    literal ``[redacted]``.

    Args:
        text: The text (typically one sentence) to redact.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded model output with bracketed spans replaced by
        ``[redacted]``; an empty string when *text* is empty or whitespace.
    """
    # Nothing to redact; also avoids asking generate() for max_length=0.
    if not text or not text.strip():
        return ''

    prompt = ["Mask Generation: " + text + '.']
    inputs = tokenizer(prompt, max_length=512, truncation=True, return_tensors="pt")

    # len(text) counts characters while max_length counts tokens, so a very
    # short sentence would cap the output at a handful of tokens.  Never go
    # below the tokenised prompt length so short inputs are not cut off.
    max_output_len = max(len(text), inputs["input_ids"].shape[1])

    output = model.generate(
        **inputs, num_beams=8, do_sample=True, max_length=max_output_len
    )
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    return _BRACKETED_SPAN.sub('[redacted]', decoded_output.strip())
28
+
29
+
30
+
31
+ def read_pdf(file):
32
+ pdf_document = fitz.open(stream=file.read(), filetype="pdf")
33
+ text = ""
34
+ for page_num in range(len(pdf_document)):
35
+ page = pdf_document.load_page(page_num)
36
+ text += page.get_text()
37
+ return text
38
+
39
def read_docx(file):
    """Return the paragraph texts of a .docx file joined by newlines."""
    paragraphs = Document(file).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
43
+
44
def read_txt(file):
    """Decode an uploaded plain-text file as UTF-8.

    Args:
        file: A binary file-like object whose ``read()`` returns bytes.

    Returns:
        The decoded text, without a leading byte-order mark.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 as well, but also drops a leading BOM
    # so it does not end up as a stray U+FEFF at the start of the text.
    return file.read().decode("utf-8-sig")
47
+
48
def process_file(file):
    """Extract text from an uploaded file based on its MIME type.

    Dispatches on ``file.type`` to the PDF, DOCX or plain-text reader and
    returns the extracted text.  For any other MIME type the literal
    message ``"Unsupported file type."`` is returned instead.
    """
    pdf_mime = "application/pdf"
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    txt_mime = "text/plain"

    mime_type = file.type
    if mime_type not in (pdf_mime, docx_mime, txt_mime):
        return "Unsupported file type."
    if mime_type == pdf_mime:
        return read_pdf(file)
    if mime_type == docx_mime:
        return read_docx(file)
    return read_txt(file)
57
+
58
# --- Streamlit page: upload a document, redact it sentence by sentence,
# --- show the result and offer it as a download.
st.title("File Reader")

uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])

if uploaded_file is not None:
    file_contents = process_file(uploaded_file)
    sentences = sentence_tokenize(file_contents)

    # One redacted sentence per line, each terminated by a newline.  Built
    # with join instead of repeated string += inside an index loop.
    processed_text = "".join(
        mask_generation(sentence) + '\n' for sentence in sentences
    )
    st.text_area("File Contents", processed_text, height=400)

    # Kept inside the branch: processed_text only exists after an upload.
    st.download_button(
        label="Download Processed File",
        data=processed_text,
        file_name="processed_file.txt",
        mime="text/plain",
    )