File size: 1,590 Bytes
28ec4f0
a3fdd99
a7fa548
836e16d
a3fdd99
 
 
 
 
 
 
 
 
 
 
 
a7fa548
a3fdd99
 
 
 
6c152f9
 
4d16c37
80419e0
 
4d16c37
a3fdd99
 
a7fa548
 
 
 
 
9097656
 
28ec4f0
a3fdd99
28ec4f0
bfb2bfb
a4300de
a7fa548
a4300de
3dfe2a3
 
f3a61e0
9097656
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, TfidfRetriever
from haystack.schema import Document
import logging

# Streamlit re-executes this script from top to bottom on every widget
# interaction. Objects that must outlive a single run (the stored documents,
# the loaded summarization model) are therefore kept in st.session_state
# instead of being rebuilt -- and thereby emptied / reloaded -- on each rerun.
if "document_store" not in st.session_state:
    st.session_state["document_store"] = InMemoryDocumentStore()
# Holds the preprocessed PDF chunks for the current browser session; it keeps
# its contents across reruns until "Clear DocumentStore" is pressed.
document_store = st.session_state["document_store"]

# Cleans the converted text and splits it by word into pieces of length 100
# with an overlap of 3, respecting sentence boundaries (see the keyword
# arguments below). Rebuilt on every run; it is only a bundle of settings here.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=3
)

# The summarizer is created once per session so that the
# "google/pegasus-xsum" model is not instantiated again on every button click.
if "summarizer" not in st.session_state:
    st.session_state["summarizer"] = TransformersSummarizer(model_name_or_path="google/pegasus-xsum")
summarizer = st.session_state["summarizer"]


def pdf_to_document_store(pdf_files):
    """Convert uploaded PDFs to text, preprocess them and store the result.

    Args:
        pdf_files: Iterable of Streamlit uploaded-file objects. Each one must
            expose ``name`` (shown in the UI) and ``getvalue()`` (the raw PDF
            bytes).

    Side effects:
        Writes each file name and the number of converted documents to the
        Streamlit page, and writes the preprocessed documents into the
        module-level ``document_store``.
    """
    # Imported locally so this function brings its own stdlib dependencies.
    import tempfile
    from pathlib import Path

    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    documents = []
    # Uploaded files live in memory, and ``pdf.name`` is only the client-side
    # file name -- not a path that exists on the server. The converter needs a
    # real file, so every upload is first materialised inside a temporary
    # directory that is removed again once conversion is finished.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for index, pdf in enumerate(pdf_files):
            st.write(pdf.name)
            # A generated name avoids collisions between uploads that share a
            # file name and keeps client-supplied text out of the path.
            pdf_path = Path(tmp_dir) / f"upload_{index}.pdf"
            pdf_path.write_bytes(pdf.getvalue())
            doc = converter.convert(file_path=pdf_path, meta=None)[0]
            documents.append(doc)
    st.write(len(documents))
    if not documents:
        # Nothing was uploaded, so there is nothing to preprocess or store.
        return
    preprocessed_docs = preprocessor.process(documents)
    document_store.write_documents(preprocessed_docs)

def summarize(files):
    """Ingest *files* into the document store and display one joint summary.

    Args:
        files: Uploaded PDF files, forwarded unchanged to
            ``pdf_to_document_store``.

    The summary is produced from every document currently held in
    ``document_store`` and written to the Streamlit page; nothing is returned.
    """
    pdf_to_document_store(files)
    stored_docs = document_store.get_all_documents()
    st.write(
        summarizer.predict(documents=stored_docs, generate_single_summary=True)
    )


# --- Page layout: uploader plus three action buttons -----------------------
uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True)

# With accept_multiple_files=True the uploader yields a list that is empty
# (not None) while nothing has been uploaded, so the ``is not None`` check
# alone does not tell us whether there is anything to summarize.
if uploaded_files is not None:
    st.write(len(uploaded_files))
    if st.button('Summarize Documents'):
        if uploaded_files:
            summarize(uploaded_files)
        else:
            # Avoid running the summarization pipeline without any input.
            st.warning("Please upload at least one PDF file before summarizing.")

# Show how many documents the document store currently holds.
if st.button('Calculate num of docs'):
    st.write(document_store.get_document_count())

# Remove every document from the document store.
if st.button('Clear DocumentStore'):
    document_store.delete_all_documents()