"""Streamlit app that summarizes uploaded PDF files with Haystack.

Uploaded PDFs are converted to text, split into passages by the
``PreProcessor``, written to an in-memory document store, and then condensed
into a single summary by a Pegasus-based ``TransformersSummarizer``.
"""
import logging
import os
import tempfile

import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import (
    PDFToTextConverter,
    PreProcessor,
    TfidfRetriever,
    TransformersSummarizer,
)
from haystack.schema import Document


def _session_resource(key, factory):
    """Return the object stored under *key* in the session, creating it once.

    Streamlit re-executes this script from the top on every widget
    interaction, so anything built at module level is rebuilt on each click.
    Keeping the object in ``st.session_state`` lets it survive those reruns.
    """
    if key not in st.session_state:
        st.session_state[key] = factory()
    return st.session_state[key]


# Persisted across reruns; otherwise the store would be empty again by the
# time the "Calculate num of docs" / "Clear DocumentStore" buttons run.
document_store = _session_resource("document_store", InMemoryDocumentStore)

preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=3,
)

# Persisted across reruns so the model is not reloaded on every interaction.
summarizer = _session_resource(
    "summarizer",
    lambda: TransformersSummarizer(model_name_or_path="google/pegasus-xsum"),
)


def _convert_uploaded_pdf(converter, pdf):
    """Convert one uploaded PDF into a Haystack document.

    The uploaded file only exists in memory, and ``pdf.name`` is just its
    display name, not a path on disk. The bytes are therefore written to a
    temporary file that the converter can open, and that file is removed
    again whether or not the conversion succeeds.
    """
    # delete=False + explicit removal: the file must be closed before another
    # reader opens it (an open NamedTemporaryFile cannot be reopened on
    # Windows).
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf.getvalue())
        tmp_path = tmp.name
    try:
        return converter.convert(file_path=tmp_path, meta=None)[0]
    finally:
        os.remove(tmp_path)


def pdf_to_document_store(pdf_files):
    """Convert the uploaded PDFs and write their passages to the store.

    Args:
        pdf_files: Iterable of file objects returned by ``st.file_uploader``.

    Writes each file name and the number of converted documents to the page.
    """
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    documents = []
    for pdf in pdf_files:
        st.write(pdf.name)
        documents.append(_convert_uploaded_pdf(converter, pdf))
    st.write(len(documents))
    if not documents:
        # Nothing to preprocess or store.
        return
    preprocessed_docs = preprocessor.process(documents)
    document_store.write_documents(preprocessed_docs)


def summarize(files):
    """Index the uploaded files and display one summary of the stored text.

    The summary covers every document currently in the store, including
    documents indexed earlier in the same session that have not been cleared.

    Args:
        files: Uploaded PDF files to add to the store before summarizing.
    """
    if not files:
        st.warning("Please upload at least one PDF file first.")
        return
    pdf_to_document_store(files)
    summary = summarizer.predict(
        documents=document_store.get_all_documents(),
        generate_single_summary=True,
    )
    st.write(summary)


uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True)

# With accept_multiple_files=True the uploader yields a list (empty when
# nothing is uploaded), so this check passes and the controls always render.
if uploaded_files is not None:
    st.write(len(uploaded_files))

    if st.button('Summarize Documents'):
        summarize(uploaded_files)

    if st.button('Calculate num of docs'):
        st.write(document_store.get_document_count())

    if st.button('Clear DocumentStore'):
        document_store.delete_all_documents()