File size: 2,350 Bytes
22b8e0b 49a314a 22b8e0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
from typing import Callable, Dict, List, Optional
from pathlib import Path
import re
import logging
import string
import streamlit as st
logger = logging.getLogger(__name__)
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter
from haystack.nodes.file_converter import PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber
import pandas as pd
import tempfile
import sqlite3
def load_document(
    file_path: str,
    file_name,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Convert a docx, txt or pdf file into a list of haystack Documents.

    The text is extracted with the Haystack converter that matches the
    extension of ``file_name`` and the filename is stored as metadata under
    the ``"name"`` key. When the Haystack converter yields an empty string
    for a pdf, the text is extracted again with pdfplumber.

    :param file_path: path of the file to read.
    :param file_name: name of the file; its extension (matched
        case-insensitively) selects the converter and it is stored in the
        returned Document's metadata.
    :param encoding: forwarded unchanged to the converter.
    :param id_hash_keys: forwarded unchanged to the converter and to the
        returned Document.
    :return: a list holding one haystack.schema.Document.
    :raises ValueError: if ``file_name`` does not end in .pdf, .txt or .docx.
    """
    lowered_name = file_name.lower()
    is_pdf = lowered_name.endswith('.pdf')

    # Pick the converter up front and fail with a clear message on anything
    # else, instead of hitting an unbound `converter` further down.
    if is_pdf:
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lowered_name.endswith('.txt'):
        converter = TextConverter()
    elif lowered_name.endswith('.docx'):
        converter = DocxToTextConverter()
    else:
        raise ValueError(
            "Unsupported file type for {!r}: expected a .pdf, .txt or .docx "
            "file".format(file_name)
        )

    logger.info("Converting %s", file_name)

    # PDFToTextConverter, TextConverter, and DocxToTextConverter
    # return a list containing a single Document
    document = converter.convert(
        file_path=file_path, meta=None,
        encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = document.content

    # Certain pdf types come back empty from the Haystack converter; retry
    # those with pdfplumber. Only pdfs are retried, because pdfplumber cannot
    # open txt or docx files.
    if is_pdf and text == "":
        with st.spinner("using pdfplumber"), pdfplumber.open(file_path) as pdf:
            # extract_text() may yield None for a page without text; treat it
            # as an empty string so the join below cannot fail.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        text = ' '.join(page_texts)

    # Build the Document once, with the final text, rather than creating it
    # empty and mutating it afterwards.
    # NOTE(review): presumably Haystack derives the Document id from its
    # content at construction time - confirm against the installed version.
    return [
        Document(
            content=text,
            meta={"name": file_name},
            id_hash_keys=id_hash_keys,
        )
    ]
|