Spaces:
Build error
Build error
File size: 1,573 Bytes
28ec4f0 a3fdd99 a7fa548 836e16d a3fdd99 a7fa548 a3fdd99 6c152f9 4d16c37 80419e0 4d16c37 a3fdd99 a7fa548 28ec4f0 a3fdd99 28ec4f0 bfb2bfb a4300de a7fa548 a4300de 3dfe2a3 f3a61e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, TfidfRetriever
from haystack.schema import Document
import logging
# Streamlit re-executes this script from top to bottom on every widget
# interaction. Objects created at module level would therefore be rebuilt on
# each button click: the document store would come back empty and the
# summarization model would be loaded again. Keeping both in st.session_state
# lets them survive reruns for the lifetime of the browser session.
if "document_store" not in st.session_state:
    st.session_state["document_store"] = InMemoryDocumentStore()
document_store = st.session_state["document_store"]

# Cleans the converted text and splits it into passages of about 100 words,
# respecting sentence boundaries, with a 3-word overlap between passages.
# Cheap to construct, so it is simply rebuilt on every rerun.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=3,
)

# Loading the Pegasus model is the expensive step, so do it once per session.
if "summarizer" not in st.session_state:
    st.session_state["summarizer"] = TransformersSummarizer(
        model_name_or_path="google/pegasus-xsum"
    )
summarizer = st.session_state["summarizer"]
def pdf_to_document_store(pdf_files):
    """Convert uploaded PDFs to text, preprocess them and index the result.

    Each uploaded file is written to a temporary file before conversion,
    because the converter is given a ``file_path`` and an upload's ``name``
    is only its file name, not a location on disk.

    Args:
        pdf_files: Iterable of uploaded file objects exposing ``name`` and
            ``getvalue()`` (as returned by ``st.file_uploader``).

    Returns:
        None. The preprocessed documents are written to the module-level
        ``document_store``; progress is reported through ``st.write``.
    """
    # Local imports keep this function self-contained.
    import tempfile
    from pathlib import Path

    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    documents = []
    # The temporary directory (and every file in it) is removed on exit,
    # even if a conversion raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for index, pdf in enumerate(pdf_files):
            st.write(pdf.name)
            # Use a generated file name rather than the user-supplied one so
            # the path always stays inside the temporary directory.
            tmp_path = Path(tmp_dir) / f"upload_{index}.pdf"
            tmp_path.write_bytes(pdf.getvalue())
            # Keep the original file name in the metadata, since the
            # temporary path carries no useful information.
            doc = converter.convert(file_path=tmp_path, meta={"name": pdf.name})[0]
            documents.append(doc)
    st.write(len(documents))
    if not documents:
        # Nothing was converted, so there is nothing to preprocess or index.
        return None
    preprocessed_docs = preprocessor.process(documents)
    document_store.write_documents(preprocessed_docs)
    return None
def summarize(files):
    """Index the uploaded PDFs and display a single summary of all of them.

    Args:
        files: Uploaded PDF file objects, forwarded to
            ``pdf_to_document_store``.

    Returns:
        None. The summary, or a warning when there is nothing to summarize,
        is rendered on the Streamlit page.
    """
    pdf_to_document_store(files)
    stored_docs = document_store.get_all_documents()
    if not stored_docs:
        # Do not hand the summarizer an empty document list; tell the user
        # instead of letting the prediction call fail.
        st.warning("No text could be extracted from the uploaded files, so there is nothing to summarize.")
        return
    summary = summarizer.predict(documents=stored_docs, generate_single_summary=True)
    st.write(summary)
# Only PDFs can be converted downstream, so restrict the uploader to them.
uploaded_files = st.file_uploader("Choose PDF files", type=["pdf"], accept_multiple_files=True)

# With accept_multiple_files=True the uploader returns a list (empty when
# nothing has been uploaded), never None, so test for a non-empty list.
if uploaded_files:
    st.write(len(uploaded_files))
    if st.button('Summarize Documents'):
        # Start from an empty store so earlier uploads are not summarized again.
        document_store.delete_documents()
        summarize(uploaded_files)

# Always available: reports how many passages are currently indexed.
if st.button('Calculate num of docs'):
    st.write(document_store.get_document_count())
|