Spaces:

pszemraj
/

document-summarization

Running on CPU Upgrade

App Files Files Community

document-summarization / utils.py

pszemraj

🎨 improve formatting

e219aa1 over 1 year ago

raw

history blame

4.11 kB

	"""
	utils.py - Utility functions for the project.
	"""

	import re
	import subprocess
	from datetime import datetime
	from pathlib import Path
	from typing import Optional, Union

	import torch
	from natsort import natsorted


	def validate_pytorch2(torch_version: str = None):
	    """
	    validate_pytorch2 - check whether a PyTorch version string is a 2.x release

	    Parameters
	    ----------
	    torch_version : str, optional, the version string to check; when None,
	        torch.__version__ of the imported torch package is used

	    Returns
	    -------
	    bool, True if the string starts with "2.<digits>", otherwise False
	    """
	    if torch_version is None:
	        torch_version = torch.__version__

	    # re.match only anchors at the start, so anything following the numeric
	    # part of the version string does not affect the result
	    found = re.match(r"^2\.\d+(\.\d+)*", torch_version)
	    return found is not None


	def get_timestamp() -> str:
	    """
	    get_timestamp - format the current time as a compact timestamp
	    Returns:
	        str, the value of datetime.now() rendered as YYYYMMDD_HHMMSS
	    """
	    now = datetime.now()
	    return f"{now:%Y%m%d_%H%M%S}"


	def truncate_word_count(text: str, max_words: int = 512) -> dict:
	    """
	    truncate_word_count - a helper function for the gradio module

	    Parameters
	    ----------
	    text : str, required, the text to be processed
	    max_words : int, optional, the maximum number of words, default=512

	    Returns
	    -------
	    dict with two keys:
	        "was_truncated" : bool, whether text has more than max_words words
	        "truncated_text" : str, the first max_words words joined by single
	            spaces when truncated, otherwise text exactly as passed in
	    """
	    # str.split() with no separator splits on runs of whitespace and ignores
	    # leading/trailing whitespace. Splitting with the regex \s+ instead
	    # produces empty strings at the edges, which inflates the word count and
	    # can push a real word out of the truncated result.
	    words = text.split()
	    was_truncated = len(words) > max_words
	    return {
	        "was_truncated": was_truncated,
	        "truncated_text": " ".join(words[:max_words]) if was_truncated else text,
	    }


	def load_examples(src, filetypes=(".txt", ".pdf")):
	    """
	    load_examples - a helper function for the gradio module to load examples

	    Creates the src directory if needed, downloads a sample PDF into it with
	    wget, then reads every file directly inside src whose suffix is in
	    filetypes.

	    Parameters
	    ----------
	    src : str or Path, required, the directory holding the example files
	    filetypes : iterable of str, optional, file suffixes (including the dot)
	        to load, default=(".txt", ".pdf")

	    Returns
	    -------
	    list of list, one row per matching file in natsorted() order; each row is
	    the file's text followed by the fixed values "base", 2, 1024, 0.7, 3.5, 3
	    (presumably defaults for the gradio inputs - confirm against the caller)
	    """
	    src = Path(src)
	    # parents=True so a nested examples directory can be created in one call
	    src.mkdir(parents=True, exist_ok=True)

	    pdf_url = (
	        "https://www.dropbox.com/s/y92xy7o5qb88yij/all_you_need_is_attention.pdf?dl=1"
	    )
	    pdf_path = src / "all_you_need_is_attention.pdf"
	    # Best-effort download: check=False spells out that a non-zero wget exit
	    # status is deliberately ignored rather than raised.
	    subprocess.run(["wget", pdf_url, "-O", str(pdf_path)], check=False)

	    examples = natsorted([f for f in src.iterdir() if f.suffix in filetypes])

	    # NOTE(review): every matching file is read as UTF-8 text, so a binary
	    # file such as the PDF downloaded above will likely raise
	    # UnicodeDecodeError or yield garbage - confirm whether .pdf files
	    # should be handled separately.
	    return [
	        [example.read_text(encoding="utf-8"), "base", 2, 1024, 0.7, 3.5, 3]
	        for example in examples
	    ]


	def load_example_filenames(example_path: Union[str, Path]) -> dict:
	    """
	    load_example_filenames - a helper function for the gradio module to load examples

	    Parameters
	    ----------
	    example_path : str or Path, required, the directory searched
	        (non-recursively) for *.txt files

	    Returns
	    -------
	    dict, the examples (filename:path under example_path), in sorted order
	    """
	    example_path = Path(example_path)
	    # glob() yields entries in whatever order the filesystem reports them,
	    # which differs between platforms; sorting keeps the mapping (and
	    # anything built from its keys) stable from run to run.
	    return {f.name: f for f in sorted(example_path.glob("*.txt"))}


	def saves_summary(
	    summarize_output,
	    outpath: Optional[Union[str, Path]] = None,
	    add_signature=True,
	    **kwargs,
	):
	    """
	    saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file

	    summarize_output: output from summarize_via_tokenbatches(); iterated as
	        items where item["summary"][0] is the summary text and
	        item["summary_score"] is a number that can be rounded
	    outpath: path to the output file; when None, a file named
	        document_summary_<timestamp>.txt in the current working directory
	    add_signature: whether to add a signature to the output file
	    kwargs: additional keyword arguments to include in the output file,
	        written as "key: value" lines under a "## Parameters:" heading

	    Returns: Path, the file that was written
	    """
	    outpath = (
	        Path.cwd() / f"document_summary_{get_timestamp()}.txt"
	        if outpath is None
	        else Path(outpath)
	    )
	    sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
	    sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
	    full_summary = "\n".join(sum_text)
	    scores_text = "\n".join(sum_scores)

	    # Collect the report pieces in output order so the file can be written
	    # through a single handle.
	    parts = [full_summary, "\n\n"]
	    if add_signature:
	        parts += [
	            "\n\n---\n\n",
	            "Generated with the Document Summarization space :)\n\n",
	            "https://hf.co/spaces/pszemraj/document-summarization\n\n",
	        ]
	    parts += [
	        "\n",
	        "## Section Scores:\n\n",
	        scores_text,
	        "\n\n",
	        f"Date: {get_timestamp()}\n\n",
	    ]
	    if kwargs:
	        parts += ["---\n\n", "## Parameters:\n\n"]
	        parts += [f"{key}: {value}\n" for key, value in kwargs.items()]

	    # One open() with an explicit encoding: the whole file is guaranteed to
	    # be UTF-8 even on platforms whose default text encoding is different.
	    with open(outpath, "w", encoding="utf-8") as fo:
	        fo.writelines(parts)
	    return outpath