pszemraj committed
Commit 3927544 • Parent: c006617

🎨 clean up code


Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (2):
  1. app.py +21 -17
  2. pdf2text.py +40 -94
app.py CHANGED
@@ -78,11 +78,11 @@ def predict(
 def proc_submission(
     input_text: str,
     model_name: str,
-    num_beams,
-    token_batch_length,
-    length_penalty,
-    repetition_penalty,
-    no_repeat_ngram_size,
+    num_beams: int,
+    token_batch_length: int,
+    length_penalty: float,
+    repetition_penalty: float,
+    no_repeat_ngram_size: int,
     max_input_length: int = 1024,
 ):
     """
@@ -117,7 +117,7 @@ def proc_submission(
     history = {}
     clean_text = clean(input_text, lower=False)
     max_input_length = 2048 if "base" in model_name.lower() else max_input_length
-    processed = truncate_word_count(clean_text, max_input_length)
+    processed = truncate_word_count(clean_text, max_words=max_input_length)
 
     if processed["was_truncated"]:
         tr_in = processed["truncated_text"]
@@ -184,7 +184,7 @@ def proc_submission(
 
 def load_single_example_text(
     example_path: str or Path,
-    max_pages=20,
+    max_pages: int = 20,
 ) -> str:
     """
     load_single_example_text - loads a single example text file
@@ -279,13 +279,19 @@ if __name__ == "__main__":
     with gr.Row(variant="compact"):
         with gr.Column(scale=0.5, variant="compact"):
             model_name = gr.Dropdown(
-                choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0], label="Model"
+                choices=MODEL_OPTIONS,
+                value=MODEL_OPTIONS[0],
+                label="Model Name",
             )
             num_beams = gr.Radio(
                 choices=[2, 3, 4],
                 label="Beam Search: # of Beams",
                 value=2,
             )
+            load_examples_button = gr.Button(
+                "Load Example in Dropdown",
+            )
+            load_file_button = gr.Button("Load an Uploaded File")
         with gr.Column(variant="compact"):
             example_name = gr.Dropdown(
                 _examples,
@@ -303,11 +309,6 @@ if __name__ == "__main__":
                 label="Input Text (for summarization)",
                 placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
             )
-        with gr.Column(min_width=100, scale=0.5):
-            load_examples_button = gr.Button(
-                "Load Example",
-            )
-            load_file_button = gr.Button("Upload File")
 
     with gr.Column():
         gr.Markdown("## Generate Summary")
@@ -332,7 +333,7 @@ if __name__ == "__main__":
             )
 
             text_file = gr.File(
-                label="Download Summary as Text File",
+                label="Download as Text File",
                 file_count="single",
                 type="file",
                 interactive=False,
@@ -342,7 +343,7 @@ if __name__ == "__main__":
     with gr.Column():
         gr.Markdown("### Advanced Settings")
         with gr.Row(variant="compact"):
-            length_penalty = gr.inputs.Slider(
+            length_penalty = gr.Slider(
                 minimum=0.5,
                 maximum=1.0,
                 label="length penalty",
@@ -356,7 +357,7 @@ if __name__ == "__main__":
             )
 
         with gr.Row(variant="compact"):
-            repetition_penalty = gr.inputs.Slider(
+            repetition_penalty = gr.Slider(
                 minimum=1.0,
                 maximum=5.0,
                 label="repetition penalty",
@@ -371,7 +372,10 @@ if __name__ == "__main__":
     with gr.Column():
         gr.Markdown("### About")
         gr.Markdown(
-            "These models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
+            "- Models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209). The goal was to create a model that generalizes well and is useful for summarizing text in academic and everyday use."
+        )
+        gr.Markdown(
+            "- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
         )
         gr.Markdown("---")
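For reference, a minimal sketch of the pattern these app.py changes adopt: `gr.Slider` from the Gradio 3.x Blocks API replaces the removed `gr.inputs.Slider` namespace, and the now-typed `proc_submission` parameters map one-to-one onto UI components passed via `inputs=[...]`. The stub callback, component choices, and default values below are illustrative placeholders, not the Space's actual layout:

```python
import gradio as gr


def proc_submission(
    input_text: str,
    model_name: str,
    num_beams: int,
    token_batch_length: int,
    length_penalty: float,
    repetition_penalty: float,
    no_repeat_ngram_size: int,
) -> str:
    # stand-in for the real summarization pipeline in app.py
    return f"summary of {len(input_text.split())} words via {model_name}"


with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Input Text (for summarization)")
    model_name = gr.Dropdown(choices=["model-a", "model-b"], value="model-a", label="Model Name")
    num_beams = gr.Radio(choices=[2, 3, 4], value=2, label="Beam Search: # of Beams")
    # gr.Slider (Gradio 3.x) rather than the removed gr.inputs.Slider
    token_batch_length = gr.Slider(minimum=512, maximum=1024, value=1024, step=256, label="token batch length")
    length_penalty = gr.Slider(minimum=0.5, maximum=1.0, value=0.7, step=0.05, label="length penalty")
    repetition_penalty = gr.Slider(minimum=1.0, maximum=5.0, value=3.5, step=0.5, label="repetition penalty")
    no_repeat_ngram_size = gr.Slider(minimum=2, maximum=4, value=3, step=1, label="no repeat ngram size")
    summary = gr.Textbox(label="Summary")
    # each component's value is passed positionally, matching the typed signature
    gr.Button("Summarize").click(
        fn=proc_submission,
        inputs=[input_text, model_name, num_beams, token_batch_length,
                length_penalty, repetition_penalty, no_repeat_ngram_size],
        outputs=summary,
    )

demo.launch()
```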
pdf2text.py CHANGED
@@ -1,10 +1,15 @@
 # -*- coding: utf-8 -*-
 """
-
-easyocr.py - A wrapper for easyocr to convert pdf to images to text
+pdf2text.py - convert pdf files to text files using OCR
 """
-
 import logging
+import os
+import pprint as pp
+import re
+import shutil
+import time
+from datetime import date, datetime
+from os.path import basename, dirname, join
 from pathlib import Path
 
 logging.basicConfig(
@@ -14,25 +19,18 @@ logging.basicConfig(
 )
 
 
-import os
-import pprint as pp
-import re
-import shutil
-import time
-from datetime import date, datetime
-from os.path import basename, dirname, join
-from pathlib import Path
+os.environ["USE_TORCH"] = "1"
 
 from cleantext import clean
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 from libretranslatepy import LibreTranslateAPI
-from natsort import natsorted
 from spellchecker import SpellChecker
 from tqdm.auto import tqdm
 
 
 def simple_rename(filepath, target_ext=".txt"):
+    """simple_rename - get a new str to rename a file"""
     _fp = Path(filepath)
     basename = _fp.stem
     return f"OCR_{basename}_{target_ext}"
@@ -41,9 +39,6 @@ def simple_rename(filepath, target_ext=".txt"):
 def rm_local_text_files(name_contains="RESULT_"):
     """
     rm_local_text_files - remove local text files
-
-    Args:
-        name_contains (str, optional): [description]. Defaults to "OCR_".
     """
     files = [
         f
@@ -91,17 +86,12 @@ def corr(
     return s
 
 
-def fix_punct_spaces(string):
+def fix_punct_spaces(string: str) -> str:
     """
-    fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
-
-    Parameters
-    ----------
-    string : str, required, input string to be corrected
+    fix_punct_spaces - fix spaces around punctuation
 
-    Returns
-    -------
-    str, corrected string
+    :param str string: input string
+    :return str: string with spaces fixed
     """
 
     fix_spaces = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
@@ -111,17 +101,12 @@ def fix_punct_spaces(string):
     return string.strip()
 
 
-def clean_OCR(ugly_text: str):
+def clean_OCR(ugly_text: str) -> str:
     """
-    clean_OCR - clean the OCR text files.
+    clean_OCR - clean up the OCR text
 
-    Parameters
-    ----------
-    ugly_text : str, required, input string to be cleaned
-
-    Returns
-    -------
-    str, cleaned string
+    :param str ugly_text: input text to be cleaned
+    :return str: cleaned text
     """
     # Remove all the newlines.
     cleaned_text = ugly_text.replace("\n", " ")
@@ -137,9 +122,12 @@ def clean_OCR(ugly_text: str):
     return fix_punct_spaces(cleaned_text)
 
 
-def move2completed(from_dir, filename, new_folder="completed", verbose=False):
-
-    # this is the better version
+def move2completed(
+    from_dir, filename, new_folder: str = "completed", verbose: bool = False
+):
+    """
+    move2completed - move a file to a new folder
+    """
     old_filepath = join(from_dir, filename)
 
     new_filedirectory = join(from_dir, new_folder)
@@ -161,11 +149,6 @@ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
     )
 
 
-"""## pdf2text functions
-
-"""
-
-
 custom_replace_list = {
     "t0": "to",
     "'$": "'s",
@@ -239,17 +222,16 @@ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
     """
     cleantxt_ocr - clean text from OCR
 
+    https://pypi.org/project/clean-text/
     Args:
         ugly_text (str): text to clean
-        lower (bool, optional): _description_. Defaults to False.
-        lang (str, optional): _description_. Defaults to "en".
+        lower (bool, optional): lowercase text. Defaults to False.
+        lang (str, optional): language of text. Defaults to "en".
 
     Returns:
         str: cleaned text
     """
-    # a wrapper for clean text with options different than default
 
-    # https://pypi.org/project/clean-text/
     cleaned_text = clean(
         ugly_text,
         fix_unicode=True,  # fix various unicode errors
@@ -258,18 +240,15 @@ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
         no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
         no_urls=True,  # replace all URLs with a special token
         no_emails=True,  # replace all email addresses with a special token
-        no_phone_numbers=False,  # replace all phone numbers with a special token
+        no_phone_numbers=True,  # replace all phone numbers with a special token
        no_numbers=False,  # replace all numbers with a special token
         no_digits=False,  # replace all digits with a special token
         no_currency_symbols=False,  # replace all currency symbols with a special token
         no_punct=False,  # remove punctuations
         replace_with_punct="",  # instead of removing punctuations you may replace them
-        replace_with_url="<URL>",
-        replace_with_email="<EMAIL>",
-        replace_with_phone_number="<PHONE>",
-        replace_with_number="<NUM>",
-        replace_with_digit="0",
-        replace_with_currency_symbol="<CUR>",
+        replace_with_url="this url",
+        replace_with_email="this email",
+        replace_with_phone_number="this phone number",
         lang=lang,  # set to 'de' for German special handling
     )
 
@@ -277,7 +256,7 @@ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
 
 
 def format_ocr_out(OCR_data):
-
+    """format OCR output to text"""
     if isinstance(OCR_data, list):
         text = " ".join(OCR_data)
     else:
@@ -323,8 +302,15 @@ def convert_PDF_to_Text(
     PDF_file,
     ocr_model=None,
     max_pages: int = 20,
-):
+) -> str:
+    """
+    convert_PDF_to_Text - convert a PDF file to text
 
+    :param str PDF_file: path to PDF file
+    :param ocr_model: model to use for OCR, defaults to None (uses the default model)
+    :param int max_pages: maximum number of pages to process, defaults to 20
+    :return str: text from PDF
+    """
     st = time.perf_counter()
     PDF_file = Path(PDF_file)
     ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
@@ -361,43 +347,3 @@ def convert_PDF_to_Text(
     }
 
     return results_dict
-
-
-# @title translation functions
-
-lt = LibreTranslateAPI("https://translate.astian.org/")
-
-
-def translate_text(text, source_l, target_l="en"):
-
-    return str(lt.translate(text, source_l, target_l))
-
-
-def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
-    """translate a document from lang_start to lang_end
-
-    {'code': 'en', 'name': 'English'},
-    {'code': 'fr', 'name': 'French'},
-    {'code': 'de', 'name': 'German'},
-    {'code': 'it', 'name': 'Italian'},"""
-
-    src_folder = dirname(filepath)
-    src_folder = Path(src_folder)
-    trgt_folder = src_folder / f"translated_{lang_end}"
-    trgt_folder.mkdir(exist_ok=True)
-    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
-        foreign_t = f.readlines()
-    in_name = basename(filepath)
-    translated_doc = []
-    for line in tqdm(
-        foreign_t, total=len(foreign_t), desc="translating {}...".format(in_name[:10])
-    ):
-        translated_line = translate_text(line, lang_start, lang_end)
-        translated_doc.append(translated_line)
-    t_out_name = "[To {}]".format(lang_end) + simple_rename(in_name) + ".txt"
-    out_path = join(trgt_folder, t_out_name)
-    with open(out_path, "w", encoding="utf-8", errors="ignore") as f_o:
-        f_o.writelines(translated_doc)
-    if verbose:
-        print("finished translating the document! - ", datetime.now())
-    return out_path
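For reference, a short usage sketch of the converter this diff documents. The import path, input file name, and the reuse of a single docTR model across calls are assumptions; note that `convert_PDF_to_Text` returns the `results_dict` built at the end of the function body, despite the new `-> str` annotation:

```python
from pathlib import Path

from doctr.models import ocr_predictor

from pdf2text import convert_PDF_to_Text  # assumes this module is importable

# load the default docTR model once and reuse it across files,
# matching the fallback inside convert_PDF_to_Text
ocr_model = ocr_predictor(pretrained=True)

results = convert_PDF_to_Text(
    Path("example.pdf"),  # hypothetical input PDF
    ocr_model=ocr_model,
    max_pages=20,
)
print(results)  # the results_dict assembled at the end of the function
```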