Update utils/preprocessing.py

utils/preprocessing.py CHANGED (+36 -20)
@@ -1,7 +1,8 @@
 from haystack.nodes.base import BaseComponent
 from haystack.schema import Document
-from haystack.nodes import
+from haystack.nodes import ImageToTextConverter, PDFToTextConverter
 from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
+from pdf2image import convert_from_path
 from typing import Callable, Dict, List, Optional, Text, Tuple, Union
 from typing_extensions import Literal
 import pandas as pd
@@ -9,7 +10,9 @@ import logging
 import re
 import string
 from haystack.pipelines import Pipeline
+import streamlit as st

+@st.cache_data
 def useOCR(file_path: str)-> Text:
     """
     Converts image pdfs into text, Using the Farm-haystack[OCR]
@@ -21,13 +24,30 @@ def useOCR(file_path: str)-> Text:

     Returns the text file as string.
     """
-
+    # we need pdf file to be first converted into image file
+    # this will create each page as image file
+    images = convert_from_path(pdf_path = file_path)
+    list_ = []
+    # save image file in cache and read them one by one to pass it to OCR
+    for i, pdf in enumerate(images):
+        # Save pages as images in the pdf
+        pdf.save(f'PDF\image_converted_{i+1}.png', 'PNG')
+        list_.append(f'PDF\image_converted_{i+1}.png')

-    converter =
+    converter = ImageToTextConverter(remove_numeric_tables=True,
                                      valid_languages=["eng"])
-
-
+    # placeholder to collect the text from each page
+    placeholder = []
+    for file in list_:
+        document = converter.convert(
+            file_path=file, meta=None,
+        )[0]

+        text = document.content
+        placeholder.append(text)
+    # join the text from each page by page separator
+    text = '\x0c'.join(placeholder)
+    return text


@@ -37,13 +57,10 @@ class FileConverter(BaseComponent):
     Converter class, will use internally haystack PDFToTextOCR in case of image
     pdf. Cannot use the FileClassifier from haystack as its doesnt has any
     label/output class for image.
-
     1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
     2. https://docs.haystack.deepset.ai/docs/file_converters
     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
     4. https://docs.haystack.deepset.ai/reference/file-converters-api
-
-
     """

     outgoing_edges = 1
@@ -84,8 +101,6 @@

         documents = []

-
-        # encoding is empty, probably should be utf-8
         document = converter.convert(
                       file_path=file_path, meta=None,
                       encoding=encoding, id_hash_keys=id_hash_keys
@@ -101,10 +116,12 @@
         if filtered == "":
             logging.info("Using OCR")
             text = useOCR(file_path)
-
+
         documents.append(Document(content=text,
                                   meta={"name": file_name},
                                   id_hash_keys=id_hash_keys))
+
+

         logging.info('file conversion succesful')
         output = {'documents': documents}
@@ -124,7 +141,6 @@ def basic(s:str, remove_punc:bool = False):

     """
     Performs basic cleaning of text.
-
     Params
     ----------
     s: string to be processed
@@ -150,6 +166,7 @@ def basic(s:str, remove_punc:bool = False):

     return s.strip()

+
 def paraLengthCheck(paraList, max_len = 100):
     """
     There are cases where preprocessor cannot respect word limit, when using
@@ -187,15 +204,13 @@ class UdfPreProcessor(BaseComponent):
     class to preprocess the document returned by FileConverter. It will check
     for splitting strategy and splits the document by word or sentences and then
     synthetically create the paragraphs.
-
     1. https://docs.haystack.deepset.ai/docs/preprocessor
     2. https://docs.haystack.deepset.ai/reference/preprocessor-api
     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
-
     """
     outgoing_edges = 1

-    def run(self, documents:List[Document], remove_punc:bool=False,
+    def run(self, documents:List[Document], remove_punc:bool=False, apply_clean = True,
             split_by: Literal["sentence", "word"] = 'sentence',
             split_length:int = 2, split_respect_sentence_boundary:bool = False,
             split_overlap:int = 0):
@@ -250,8 +265,11 @@ class UdfPreProcessor(BaseComponent):
             # # basic cleaning before passing it to preprocessor.
             # i = basic(i)
             docs_processed = preprocessor.process([i])
-
-
+            if apply_clean:
+                for item in docs_processed:
+                    item.content = basic(item.content, remove_punc= remove_punc)
+            else:
+                pass

             df = pd.DataFrame(docs_processed)
             all_text = " ".join(df.content.to_list())
@@ -275,7 +293,6 @@ def processingpipeline():
     """
     Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
     from utils.preprocessing
-
     """

     preprocessing_pipeline = Pipeline()
@@ -287,5 +304,4 @@ def processingpipeline():
     preprocessing_pipeline.add_node(component = custom_preprocessor,
                                     name ='UdfPreProcessor', inputs=["FileConverter"])

-    return preprocessing_pipeline
-
+    return preprocessing_pipeline
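Notes on the change follow. The new useOCR() rasterizes each pdf page with pdf2image, runs ImageToTextConverter over the page images, and rejoins the pages with a form-feed separator. Below is a minimal sketch of that flow with the commit's hard-coded 'PDF\' output folder swapped for a temporary directory; the helper name ocr_pdf and the temp-dir handling are my substitutions, not part of this commit.

import tempfile
from pathlib import Path

from pdf2image import convert_from_path
from haystack.nodes import ImageToTextConverter

def ocr_pdf(file_path: str) -> str:
    # same pdf2image -> ImageToTextConverter flow as useOCR(), but the page
    # images live in a throwaway directory instead of a 'PDF\' folder
    converter = ImageToTextConverter(remove_numeric_tables=True,
                                     valid_languages=["eng"])
    pages = []
    with tempfile.TemporaryDirectory() as tmp:
        for i, page in enumerate(convert_from_path(file_path)):
            png = Path(tmp) / f"page_{i + 1}.png"
            page.save(png, "PNG")  # one image file per pdf page
            pages.append(converter.convert(file_path=str(png), meta=None)[0].content)
    # '\x0c' (form feed) preserves the page boundaries, as in the commit
    return "\x0c".join(pages)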
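FileConverter picks between plain text extraction and the OCR path by testing whether anything meaningful survived conversion (the filtered == "" check in the diff). Below is a hypothetical mirror of that test; the helper name and the exact characters stripped are assumptions, since the diff does not show how filtered is computed.

import string

def needs_ocr(extracted: str) -> bool:
    # drop whitespace and punctuation; if nothing remains, the pdf is most
    # likely a scanned image and useOCR() should take over
    leftover = extracted.translate(
        str.maketrans("", "", string.punctuation + string.whitespace))
    return leftover == ""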
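With the new apply_clean flag, UdfPreProcessor.run passes every processed paragraph through basic(). The diff shows only basic()'s signature, part of its docstring, and the final return s.strip(), so the body below is a guessed stand-in consistent with that interface, not the file's actual implementation.

import re
import string

def basic(s: str, remove_punc: bool = False) -> str:
    # collapse runs of whitespace into single spaces
    s = re.sub(r"\s+", " ", s)
    if remove_punc:
        # strip all punctuation characters
        s = s.translate(str.maketrans("", "", string.punctuation))
    return s.strip()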
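End to end, processingpipeline() wires FileConverter into UdfPreProcessor. Below is a hypothetical invocation using haystack v1's convention of one params dict per node name; the keys are inferred from what the diff shows each node using, and the output key is assumed to mirror FileConverter's 'documents'.

from utils.preprocessing import processingpipeline

pipeline = processingpipeline()
result = pipeline.run(
    params={
        # FileConverter reads file_path and uses file_name for Document meta
        "FileConverter": {"file_path": "sample.pdf",
                          "file_name": "sample.pdf"},
        # run() parameters visible in this diff, including the new apply_clean
        "UdfPreProcessor": {"split_by": "sentence",
                            "split_length": 2,
                            "apply_clean": True,
                            "remove_punc": False},
    })
paragraphs = result["documents"]  # assumed output key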
|