"""Streamlit page that indexes uploaded PDF files into an in-memory Haystack store."""

import logging
import os
import tempfile

import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import (
    FARMReader,
    PreProcessor,
    PDFToTextConverter,
    TfidfRetriever,
)
from haystack.schema import Document

# Module-level store shared by the helpers below.
document_store = InMemoryDocumentStore()

# Splits converted text into ~100-word passages that respect sentence
# boundaries, with a 3-word overlap between consecutive passages.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=3,
)


def _as_document(item):
    """Return *item* as a ``Document``, building one from a dict if necessary."""
    # NOTE(review): depending on the Haystack release, converters hand back
    # either Document objects or plain dicts -- confirm against the pinned
    # version and drop the branch that is not needed.
    return item if isinstance(item, Document) else Document.from_dict(item)


def _convert_pdf(converter, pdf):
    """Convert one PDF to a list of ``Document`` objects.

    ``pdf`` may be an in-memory upload (anything exposing ``getvalue()`` and
    ``name``) or an object whose ``name`` is already a readable path on disk.
    """
    if not hasattr(pdf, "getvalue"):
        # No in-memory buffer: treat ``name`` as a real path, as the original
        # code did.
        converted = converter.convert(file_path=pdf.name, meta=None)
    else:
        suffix = os.path.splitext(pdf.name)[1] or ".pdf"
        # The converter needs a path on disk, but an uploaded file only exists
        # in memory and its ``name`` is just the client-side filename.
        # delete=False + manual removal so the converter can reopen the file
        # on platforms that forbid opening a NamedTemporaryFile twice.
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(pdf.getvalue())
            tmp_path = tmp.name
        try:
            converted = converter.convert(file_path=tmp_path, meta=None)
        finally:
            os.remove(tmp_path)

    if converted is None:
        return []
    if not isinstance(converted, (list, tuple)):
        # Tolerate a converter that returns a single document.
        converted = [converted]
    return [_as_document(item) for item in converted]


def pdf_to_document_store(pdf_files):
    """Replace the contents of ``document_store`` with the given PDFs.

    All existing documents are deleted first. Each PDF is converted to text,
    split by ``preprocessor``, and written to the store.

    Args:
        pdf_files: Iterable of uploaded-file objects (or objects whose
            ``name`` attribute is a path to a PDF on disk).

    Returns:
        None.
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    documents = []
    for pdf in pdf_files:
        documents.extend(_convert_pdf(converter, pdf))

    if not documents:
        # Nothing was converted; leave the store empty.
        return None

    preprocessed_docs = preprocessor.process(documents)
    document_store.write_documents(preprocessed_docs)
    return None


uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True)
# With accept_multiple_files=True the uploader yields an empty list (not None)
# when nothing has been uploaded, so test truthiness rather than identity.
if uploaded_files:
    pdf_to_document_store(uploaded_files)
# Call the method: passing the bound method itself would render its repr.
st.write(document_store.get_document_count())