File size: 3,003 Bytes
28ec4f0
a3fdd99
6e57c67
836e16d
a3fdd99
f6cc0cb
 
a3fdd99
9a54394
 
 
 
 
 
 
5fdc2d5
cc0fbf1
9a54394
 
58c1223
9a54394
a3fdd99
 
3a4a956
 
a3fdd99
3a4a956
 
 
 
 
 
f6cc0cb
a7fa548
3a4a956
 
2d4dc51
67f4a7d
 
 
 
 
d42a71a
67f4a7d
9a54394
9097656
762970d
 
fe7b517
c1986cc
fe7b517
 
3a4a956
a3fdd99
3a4a956
 
67f4a7d
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
from haystack.schema import Document
import logging
import base64


@st.cache(
    hash_funcs={"builtins.SwigPyObject": lambda _: None},
    allow_output_mutation=True,
)
def start_haystack():
    """Build the Haystack components the app works with.

    Decorated with ``st.cache`` (presumably so the summarization model is
    not rebuilt on every script rerun -- confirm against the Streamlit
    version in use).

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple: an
        ``InMemoryDocumentStore``, a ``TransformersSummarizer`` using
        ``facebook/bart-large-cnn``, and a ``PreProcessor`` that cleans the
        text and splits it into chunks of 200 words.
    """
    store = InMemoryDocumentStore()

    # Cleaning and splitting options, gathered in one place for readability.
    preprocessing_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }
    text_preprocessor = PreProcessor(**preprocessing_options)

    bart_summarizer = TransformersSummarizer(
        model_name_or_path="facebook/bart-large-cnn"
    )
    return store, bart_summarizer, text_preprocessor


def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with the text of a PDF.

    The module-level ``document_store`` is emptied, the uploaded bytes are
    written to ``temp-path.pdf``, that file is converted to text, run through
    the module-level ``preprocessor``, and the resulting documents are written
    to the store.

    Args:
        pdf_file: A binary file-like object whose ``read()`` returns the raw
            PDF bytes (here, the object returned by ``st.file_uploader``).
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # The converter is given a file path, so the upload has to be on disk.
    # The ``with`` block must end *before* converting: leaving it flushes and
    # closes the handle, so the converter never sees a partially written file.
    with open("temp-path.pdf", "wb") as temp_file:
        temp_file.write(pdf_file.read())

    doc = converter.convert(file_path="temp-path.pdf", meta=None)
    preprocessed_docs = preprocessor.process(doc)
    document_store.write_documents(preprocessed_docs)

def summarize(file):
    """Load a PDF into the document store and summarize what was stored.

    Args:
        file: The uploaded PDF, passed straight to ``pdf_to_document_store``.

    Returns:
        Whatever ``summarizer.predict`` returns when called on all stored
        documents with ``generate_single_summary=True``.
    """
    pdf_to_document_store(file)
    stored_documents = document_store.get_all_documents()
    return summarizer.predict(
        documents=stored_documents,
        generate_single_summary=True,
    )

def set_state_if_absent(key, value):
    """Store ``value`` under ``key`` in ``st.session_state`` unless it is set.

    An existing entry for ``key`` is left untouched.
    """
    if key in st.session_state:
        return
    st.session_state[key] = value
        
# Make sure the "summaries" slot exists before the display code below reads it.
set_state_if_absent("summaries", None)
document_store, summarizer, preprocessor = start_haystack()

st.title('TL;DR with Haystack')

st.markdown( """
This Summarization demo uses a [Haystack TransformerSummarizer node](https://haystack.deepset.ai/pipeline_nodes/summarizer). You can upload a PDF file, which will be converted to text with the [Haystack PDFtoTextConverter](https://haystack.deepset.ai/reference/file-converters#pdftotextconverter). In this demo, we produce 1 summary for the whole file you upload. So, the TransformerSummarizer treats the whole thing as one string, which means along with the model limitations, PDFs that have a lot of unneeded text at the beginning produce poor results. For best results, upload a document that has minimal intro and tables at the top. 
""", unsafe_allow_html=True)

# Only PDFs can be converted downstream, so restrict the picker accordingly.
uploaded_file = st.file_uploader(
    "Choose a PDF file",
    type=["pdf"],
    accept_multiple_files=False,
)

# The button is only rendered once a file has been uploaded (short-circuit).
if uploaded_file is not None and st.button('Summarize Document'):
    with st.spinner("📚    Please wait while we produce a summary..."):
        try:
            st.session_state.summaries = summarize(uploaded_file)
        except Exception:
            # Top-level UI boundary: record the traceback, drop any summary
            # left over from a previous upload so it is not shown as if it
            # belonged to this file, and tell the user something went wrong.
            logging.exception("Failed to summarize the uploaded PDF")
            st.session_state.summaries = None
            st.error(
                "Sorry, the document could not be summarized. "
                "Please check that it is a valid PDF and try again."
            )

if st.session_state.summaries:
    st.write('## Summary')
    for summary in st.session_state.summaries:
        st.write(summary.content)