Tuana committed on
Commit
cd8e155
1 Parent(s): 9b93f04

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -14,7 +14,6 @@ def start_haystack():
14
  split_by="word",
15
  split_length=100,
16
  split_respect_sentence_boundary=True,
17
- split_overlap=0
18
  )
19
  summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-newsroom")
20
  return document_store, summarizer, preprocessor
@@ -24,9 +23,10 @@ def pdf_to_document_store(pdf_files):
24
  converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
25
  documents = []
26
  for pdf in pdf_files:
27
- doc = converter.convert(file_path=pdf.name, meta=None)
 
28
  documents.append(doc)
29
- document_store.write_documents(documents)
30
  st.write('Document count: ', document_store.get_document_count())
31
 
32
 
 
14
  split_by="word",
15
  split_length=100,
16
  split_respect_sentence_boundary=True,
 
17
  )
18
  summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-newsroom")
19
  return document_store, summarizer, preprocessor
 
23
  converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
24
  documents = []
25
  for pdf in pdf_files:
26
+ doc = converter.convert(file_path=pdf.name, meta=None)[0]
27
+ preprocessed_doc=preprocessor.process([doc])
28
  documents.append(doc)
29
+ document_store.write_documents(preprocessed_doc)
30
  st.write('Document count: ', document_store.get_document_count())
31
 
32