File size: 1,892 Bytes
28ec4f0
a3fdd99
6e57c67
836e16d
a3fdd99
f6cc0cb
 
a3fdd99
9a54394
 
 
 
 
 
 
5fdc2d5
cc0fbf1
9a54394
 
56d5448
9a54394
a3fdd99
 
3a4a956
 
a3fdd99
3a4a956
 
 
 
 
 
f6cc0cb
a7fa548
3a4a956
 
cc5b5c1
2d4dc51
eff1d2d
d42a71a
 
 
9a54394
9097656
3a4a956
a3fdd99
3a4a956
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import base64
import logging
import os
import tempfile

import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
from haystack.schema import Document


@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components the app works with.

    The function is wrapped in ``st.cache`` so the components are meant to be
    built once and reused rather than rebuilt on every script run.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple holding an
        ``InMemoryDocumentStore``, a ``TransformersSummarizer`` loaded from
        ``facebook/bart-large-cnn``, and a ``PreProcessor`` configured to
        split by word with a split length of 200.
    """
    # Cleaning and splitting options handed to the PreProcessor.
    preprocessing_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }

    store = InMemoryDocumentStore()
    splitter = PreProcessor(**preprocessing_options)
    bart_summarizer = TransformersSummarizer(
        model_name_or_path="facebook/bart-large-cnn"
    )
    return store, bart_summarizer, splitter


def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with the text of a PDF.

    The module-level ``document_store`` is emptied first, then the PDF is
    converted to text, run through the module-level ``preprocessor`` and the
    resulting documents are written back to the store.

    Args:
        pdf_file: A binary file-like object whose ``read()`` returns the raw
            PDF bytes (here, the object returned by ``st.file_uploader``).
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # The converter takes a file path, so the upload has to be written to
    # disk. ``delete=False`` lets the file be closed (and thereby fully
    # flushed) before the converter opens it; it is removed explicitly below.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        temp_file.write(pdf_file.read())
        temp_path = temp_file.name

    try:
        doc = converter.convert(file_path=temp_path, meta=None)
    finally:
        # Do not leave the uploaded document behind, even if conversion fails.
        os.remove(temp_path)

    preprocessed_docs = preprocessor.process(doc)
    document_store.write_documents(preprocessed_docs)

def summarize(file):
    """Index the given PDF and render its summary on the page.

    Loads ``file`` into the document store via ``pdf_to_document_store``,
    writes the stored document count, then writes the content of every
    summary the summarizer returns for the stored documents.

    Args:
        file: The uploaded PDF, passed straight to ``pdf_to_document_store``.
    """
    pdf_to_document_store(file)

    doc_count = document_store.get_document_count()
    st.write('Number of documents', doc_count)

    stored_docs = document_store.get_all_documents()
    results = summarizer.predict(
        documents=stored_docs,
        generate_single_summary=True,
    )

    st.write('Summary')
    for result in results:
        st.write(result.content)
        
# Components shared by the functions above (see start_haystack for details).
document_store, summarizer, preprocessor = start_haystack()

uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)

# The button is only created once a file has been chosen; short-circuiting
# keeps st.button from being called while nothing is uploaded.
if uploaded_file is not None and st.button('Summarize Document'):
    summarize(uploaded_file)