import logging
import os
import re
import sqlite3
import string
import tempfile
from pathlib import Path
from typing import Callable, Dict, List, Optional

import pandas as pd
import pdfplumber
import streamlit as st

# Set this before the Haystack imports so the underlying transformers
# tokenizers library does not emit its fork/parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from haystack.nodes.file_converter import (
    BaseConverter,
    DocxToTextConverter,
    PDFToTextConverter,
    TextConverter,
)
from haystack.schema import Document
from haystack.utils import convert_files_to_docs, fetch_archive_from_http

logger = logging.getLogger(__name__)

def load_document(
    file_path: str,
    file_name: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Takes docx, txt, and pdf files as input and extracts the text, storing
    the filename as metadata. Since Haystack does not handle every pdf
    file, pdfplumber is attached to the pipeline as a fallback in case the
    pdf extraction via Haystack fails.

    Returns a list of type haystack.schema.Document.
    """

    if file_name.endswith(".pdf"):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif file_name.endswith(".txt"):
        converter = TextConverter()
    elif file_name.endswith(".docx"):
        converter = DocxToTextConverter()
    else:
        raise ValueError(f"Unsupported file type: {file_name}")

    documents = []
    logger.info("Converting %s", file_name)

    document = converter.convert(
        file_path=file_path, meta=None,
        encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = document.content
    documents.append(Document(content=text,
                              meta={"name": file_name},
                              id_hash_keys=id_hash_keys))

    # Check whether the extracted text is empty and, if so, fall back to a
    # different pdf processor. This can happen with certain pdf types.
    for i in documents:
        if i.content == "":
            st.write("using pdfplumber")
            text = []
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    # extract_text() returns None for pages without a text
                    # layer, so substitute an empty string.
                    text.append(page.extract_text() or "")
            i.content = " ".join(text)

    return documents
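

# A minimal usage sketch, assuming this loader is driven from a Streamlit
# file uploader; the helper name and widget label below are illustrative
# assumptions, not part of the pipeline. load_document() expects a path on
# disk, so the uploaded bytes are first written to a temporary file.
def _example_upload_and_load() -> List[Document]:
    uploaded_file = st.file_uploader("Upload a document",
                                     type=["pdf", "txt", "docx"])
    if uploaded_file is None:
        return []
    suffix = Path(uploaded_file.name).suffix
    # delete=False keeps the file on disk so the converters can reopen it.
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
    return load_document(file_path=tmp.name, file_name=uploaded_file.name)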