File size: 1,834 Bytes
28ec4f0
a3fdd99
6e57c67
836e16d
a3fdd99
f6cc0cb
 
a3fdd99
9a54394
 
 
 
 
 
 
5fdc2d5
cc0fbf1
9a54394
 
56d5448
9a54394
a3fdd99
 
3a4a956
 
a3fdd99
3a4a956
 
 
 
 
 
f6cc0cb
a7fa548
3a4a956
 
56d5448
eff1d2d
d42a71a
 
 
9a54394
9097656
3a4a956
a3fdd99
3a4a956
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
from haystack.schema import Document
import logging
import base64


@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components the app works with.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple, in that order:
        an ``InMemoryDocumentStore``, a ``TransformersSummarizer`` configured
        with ``facebook/bart-large-cnn``, and a ``PreProcessor`` that cleans
        the text and splits it by word into chunks of length 200.
    """
    # Cleaning and splitting options handed to the PreProcessor below.
    preprocessing_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }

    store = InMemoryDocumentStore()
    text_preprocessor = PreProcessor(**preprocessing_options)
    bart_summarizer = TransformersSummarizer(
        model_name_or_path="facebook/bart-large-cnn"
    )
    return store, bart_summarizer, text_preprocessor


def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with the text of a PDF.

    The module-level ``document_store`` is emptied, the uploaded bytes are
    saved to ``temp-path.pdf``, and that file is converted, preprocessed with
    the module-level ``preprocessor`` and written to the store.

    Args:
        pdf_file: Binary file-like object whose ``read()`` returns the PDF
            bytes (the caller passes the Streamlit upload).
    """
    temp_path = "temp-path.pdf"

    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # Finish and close the write before converting: the converter opens the
    # path itself, and bytes still sitting in Python's write buffer would make
    # it see an empty or truncated PDF.
    with open(temp_path, "wb") as temp_file:
        temp_file.write(pdf_file.read())

    doc = converter.convert(file_path=temp_path, meta=None)
    preprocessed_docs = preprocessor.process(doc)
    document_store.write_documents(preprocessed_docs)

def summarize(file):
    """Summarize an uploaded PDF and show the result on the Streamlit page.

    The file is first loaded into the module-level ``document_store`` via
    ``pdf_to_document_store``; every stored document is then passed to the
    module-level ``summarizer`` and each returned summary's ``content`` is
    written to the page under a 'Summary' heading.

    Args:
        file: Uploaded PDF, forwarded unchanged to ``pdf_to_document_store``.
    """
    pdf_to_document_store(file)
    # NOTE(review): confirm that predict() accepts ``batch_size`` in the
    # installed Haystack version; the keyword is kept as originally written.
    summaries = summarizer.predict(
        documents=document_store.get_all_documents(),
        generate_single_summary=True,
        batch_size=5,
    )
    st.write('Summary')
    for summary in summaries:
        st.write(summary.content)
        
# Module-level components that pdf_to_document_store() and summarize() read as globals.
document_store, summarizer, preprocessor = start_haystack()

uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)

# Short-circuit evaluation: the button is only created once a file has been chosen.
if uploaded_file is not None and st.button('Summarize Document'):
    summarize(uploaded_file)