File size: 1,964 Bytes
28ec4f0
a3fdd99
a7fa548
836e16d
a3fdd99
 
9a54394
 
 
 
 
 
 
5fdc2d5
cc0fbf1
9a54394
 
07607d7
9a54394
a3fdd99
 
 
 
6c152f9
 
5fdc2d5
 
 
 
 
5daf335
9c1fb8f
 
a7fa548
 
 
07607d7
a7fa548
9097656
9a54394
9097656
28ec4f0
a3fdd99
28ec4f0
bfb2bfb
a4300de
a7fa548
a4300de
3dfe2a3
 
f3a61e0
9097656
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, TfidfRetriever
from haystack.schema import Document
import logging

@st.cache(
    hash_funcs={"builtins.SwigPyObject": lambda _: None},
    allow_output_mutation=True,
)
def start_haystack():
    """Create the Haystack components used by the app.

    The ``st.cache`` decorator is configured with a constant hash for
    ``builtins.SwigPyObject`` values and with ``allow_output_mutation=True``,
    so the returned objects may be mutated by callers (the document store is
    written to and cleared elsewhere in this script).

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple:
        an ``InMemoryDocumentStore``, a ``TransformersSummarizer`` loaded with
        ``google/pegasus-newsroom``, and a ``PreProcessor`` that cleans text
        and splits it into 200-word chunks respecting sentence boundaries.
    """
    # Cleaning and splitting options, gathered in one place for readability.
    preprocessing_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }

    store = InMemoryDocumentStore()
    text_preprocessor = PreProcessor(**preprocessing_options)
    news_summarizer = TransformersSummarizer(
        model_name_or_path="google/pegasus-newsroom"
    )
    return store, news_summarizer, text_preprocessor


def pdf_to_document_store(pdf_files):
    """Convert PDFs to text, split them, and write the chunks to the store.

    Each PDF is written to its own temporary file, converted with
    ``PDFToTextConverter``, run through the module-level ``preprocessor`` and
    the resulting documents are written to the module-level
    ``document_store``. The store's document count is then displayed with
    ``st.write``.

    Args:
        pdf_files: Iterable of PDFs. Each item may be raw bytes or a
            file-like object exposing ``getvalue()`` or ``read()`` (such as
            the upload objects produced by ``st.file_uploader``).
    """
    import os
    import tempfile

    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    documents = []
    for pdf in pdf_files:
        # A binary file handle only accepts bytes-like data, so unwrap
        # file-like uploads before writing them out.
        if isinstance(pdf, (bytes, bytearray, memoryview)):
            payload = pdf
        elif hasattr(pdf, "getvalue"):
            payload = pdf.getvalue()
        else:
            payload = pdf.read()

        # A unique temp file per PDF avoids clashes between concurrent
        # sessions; delete=False lets the converter reopen it by path.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
            temp_file.write(payload)
            temp_path = temp_file.name

        # Convert only after the handle is closed so the data is fully
        # flushed to disk, and always remove the temp file afterwards.
        try:
            doc = converter.convert(file_path=temp_path, meta=None)[0]
        finally:
            os.remove(temp_path)

        # process() yields a list of split documents; extend keeps the
        # collection flat, which is what write_documents() is given.
        documents.extend(preprocessor.process([doc]))

    document_store.write_documents(documents)
    st.write('Document count: ', document_store.get_document_count())


def summarize(files):
    """Index *files* and display summaries of everything in the store.

    The files are first passed to ``pdf_to_document_store``; every document
    currently held by the module-level ``document_store`` is then handed to
    the module-level ``summarizer`` (with ``generate_single_summary=False``)
    and the result is rendered with ``st.write``.

    Args:
        files: The PDFs to ingest, forwarded unchanged to
            ``pdf_to_document_store``.
    """
    pdf_to_document_store(files)
    stored_documents = document_store.get_all_documents()
    summaries = summarizer.predict(
        documents=stored_documents,
        generate_single_summary=False,
    )
    st.write(summaries)

# Shared pipeline components used by the functions above.
document_store, summarizer, preprocessor = start_haystack()

uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True)

# With accept_multiple_files=True the uploader gives back an empty list (not
# None) when nothing has been uploaded, so test truthiness: the file count and
# the summarize button only appear once at least one file is present.
if uploaded_files:
    st.write(len(uploaded_files))
    if st.button('Summarize Documents'):
        summarize(uploaded_files)

# Utility buttons for inspecting and resetting the document store.
if st.button('Calculate num of docs'):
    st.write(document_store.get_document_count())

if st.button('Clear DocumentStore'):
    document_store.delete_all_documents()