ambreshrc committed on
Commit b478097
1 Parent(s): 2c2e857

Create new file

Files changed (1)
  1. app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
+ # import gradio as gr
+ # Def_04 Docx file to translated_Docx file
+ from transformers import MarianMTModel, MarianTokenizer
+ import nltk
+ from nltk.tokenize import sent_tokenize
+ from nltk.tokenize import LineTokenizer
+ nltk.download('punkt')
+ import math
+ import torch
+ from docx import Document
+ from time import sleep
+
+ import docx
+ def getText(filename):
+     # Read all paragraph text from a .docx file into one newline-joined string
+     doc = docx.Document(filename)
+     fullText = []
+     for para in doc.paragraphs:
+         fullText.append(para.text)
+     return '\n'.join(fullText)
+
+ # Def_01 applying progress bar to function
+ import sys
+
+ def print_progress_bar(index, total, label):
+     n_bar = 50  # Progress bar width
+     progress = index / total
+     sys.stdout.write('\r')
+     sys.stdout.write(f"[{'=' * int(n_bar * progress):{n_bar}s}] {int(100 * progress)}% {label}")
+     sys.stdout.flush()
+
+
+ # Run on GPU if one is available, otherwise fall back to CPU
+ if torch.cuda.is_available():
+     dev = "cuda"
+ else:
+     dev = "cpu"
+ device = torch.device(dev)
+
+ # Local path to a MarianMT English-to-Hindi checkpoint (Google Drive mount in Colab)
+ mname = '/content/drive/MyDrive/Transformers Models/opus-mt-en-hi-Trans Model'
+ tokenizer = MarianTokenizer.from_pretrained(mname)
+ model = MarianMTModel.from_pretrained(mname)
+ model.to(device)
+
+
+ def btTranslator(docxfile):
+     # Pull the plain text out of the input .docx and rejoin it line by line
+     a = getText(docxfile)
+     a1 = a.split('\n')
+     bigtext = ''' '''
+     for a in a1:
+         bigtext = bigtext + '\n' + a
+     files = Document()
+     lt = LineTokenizer()
+     batch_size = 8
+     paragraphs = lt.tokenize(bigtext)
+     translated_paragraphs = []
+
+     for index, paragraph in enumerate(paragraphs):
+         # ######################################
+         total = len(paragraphs)
+         print_progress_bar(index, total, "Percentage Bar")
+         sleep(0.5)
+
+         # ######################################
+         # Translate the paragraph sentence by sentence, batch_size sentences at a time
+         sentences = sent_tokenize(paragraph)
+         batches = math.ceil(len(sentences) / batch_size)
+         translated = []
+         for i in range(batches):
+             sent_batch = sentences[i*batch_size:(i+1)*batch_size]
+             model_inputs = tokenizer(sent_batch, return_tensors="pt", padding=True, truncation=True, max_length=500).to(device)
+             with torch.no_grad():
+                 translated_batch = model.generate(**model_inputs)
+             translated += translated_batch
+         translated = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+         translated_paragraphs += [" ".join(translated)]
+         files.add_paragraph(" ".join(translated))
+         # translated_text = "\n".join(translated_paragraphs)
+
+     # Document.save() returns None, so keep the output path and return it for the file output
+     out_path = f"Translated_{docxfile[23:]}"
+     files.save(out_path)
+     return translated_paragraphs, out_path
+
+
+ import gradio as gr
+ interface = gr.Interface(fn=btTranslator,
+                          inputs=gr.inputs.Textbox(lines=1),
+                          # inputs = gr.inputs.File(file_count="multiple", label="Input Files"),
+                          # inputs=
+                          outputs=['text', 'file'],
+                          show_progress=True
+                          )
+
+ interface.launch(debug=True)