File size: 1,991 Bytes
28ec4f0
a3fdd99
a7fa548
836e16d
a3fdd99
 
9a54394
 
 
 
 
 
 
 
 
 
2a5639d
9a54394
 
 
a3fdd99
 
 
 
6c152f9
 
4d16c37
80419e0
 
4d16c37
a3fdd99
2a5639d
a3fdd99
9c1fb8f
 
a7fa548
 
 
 
 
9097656
9a54394
9097656
28ec4f0
a3fdd99
28ec4f0
bfb2bfb
a4300de
a7fa548
a4300de
3dfe2a3
 
f3a61e0
9097656
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, TfidfRetriever
from haystack.schema import Document
import logging

@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components the app works with.

    The function is wrapped in ``st.cache`` (with output mutation allowed and
    ``builtins.SwigPyObject`` values hashed to ``None``).

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple: an
        ``InMemoryDocumentStore``, a ``TransformersSummarizer`` loading
        ``google/pegasus-xsum`` and a ``PreProcessor`` that cleans the text
        and splits it into 100-word, sentence-aligned, non-overlapping chunks.
    """
    # Cleaning and splitting options handed to the PreProcessor below.
    preprocessing_options = dict(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=100,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )

    store = InMemoryDocumentStore()
    splitter = PreProcessor(**preprocessing_options)
    pegasus = TransformersSummarizer(model_name_or_path="google/pegasus-xsum")
    return store, pegasus, splitter


def pdf_to_document_store(pdf_files):
    """Convert uploaded PDFs to text, preprocess them and index them.

    An uploaded file lives in memory and its ``name`` is only the original
    file name, not a path on disk, while the converter is given a file path.
    Each upload is therefore written to a temporary ``.pdf`` file, converted
    from there, and the temporary file is removed again afterwards.

    The file names and the intermediate document counts are echoed to the
    page with ``st.write``.

    Args:
        pdf_files: Iterable of uploaded files as returned by
            ``st.file_uploader``; each item must provide ``name`` and
            ``getvalue()``.
    """
    # Imported locally so the block does not depend on file-level imports.
    import tempfile
    from pathlib import Path

    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    documents = []
    for pdf in pdf_files:
        st.write(pdf.name)
        # delete=False plus closing the handle first lets the converter open
        # the file by path on every platform; cleanup happens in `finally`.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp.write(pdf.getvalue())
            tmp_path = Path(tmp.name)
        try:
            # convert() returns a list; only its first element is kept.
            doc = converter.convert(file_path=tmp_path, meta=None)[0]
        finally:
            tmp_path.unlink()
        documents.append(doc)
    st.write(len(documents))
    preprocessed_docs = preprocessor.process(documents)
    st.write('Preprocessed count: ', len(preprocessed_docs))
    document_store.write_documents(preprocessed_docs)
    st.write('Document count: ', document_store.get_document_count())


def summarize(files):
    """Index the uploaded PDFs and display one summary of the whole store.

    The files are first added to the document store via
    ``pdf_to_document_store``; then every document currently in the store is
    passed to the summarizer with ``generate_single_summary=True`` and the
    result is written to the page.

    Args:
        files: Uploaded PDF files to add to the store before summarizing.
    """
    pdf_to_document_store(files)
    documents = document_store.get_all_documents()
    if not documents:
        # The summarizer needs at least one document; show a hint instead of
        # letting the call fail on an empty store.
        st.warning('No documents to summarize. Upload at least one PDF first.')
        return
    summary = summarizer.predict(documents=documents, generate_single_summary=True)
    st.write(summary)

# Components come from the st.cache-decorated factory above.
document_store, summarizer, preprocessor = start_haystack()

uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True)

# With accept_multiple_files=True the uploader yields a list that is empty
# when nothing has been uploaded, so test truthiness rather than `is not None`;
# this keeps the summarize button hidden until there is something to process.
if uploaded_files:
    st.write(len(uploaded_files))
    if st.button('Summarize Documents'):
        summarize(uploaded_files)

# Utility buttons for inspecting and resetting the document store.
if st.button('Calculate num of docs'):
    st.write(document_store.get_document_count())

if st.button('Clear DocumentStore'):
    document_store.delete_all_documents()