Workaround for spellcheck failure
Browse files- pdf2text.py +23 -20
pdf2text.py
CHANGED
@@ -213,26 +213,29 @@ def eval_and_replace(text: str, match_token: str = "- ") -> str:
|
|
213 |
str: text with replaced tokens
|
214 |
"""
|
215 |
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
|
|
|
|
|
|
236 |
|
237 |
|
238 |
def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
|
|
|
213 |
str: text with replaced tokens
|
214 |
"""
|
215 |
|
216 |
+
try:
|
217 |
+
if match_token not in text:
|
218 |
+
return text
|
219 |
+
else:
|
220 |
+
while True:
|
221 |
+
full_before_text = text.split(match_token, maxsplit=1)[0]
|
222 |
+
before_text = [
|
223 |
+
char for char in full_before_text.split()[-1] if char.isalpha()
|
224 |
+
]
|
225 |
+
before_text = "".join(before_text)
|
226 |
+
full_after_text = text.split(match_token, maxsplit=1)[-1]
|
227 |
+
after_text = [char for char in full_after_text.split()[0] if char.isalpha()]
|
228 |
+
after_text = "".join(after_text)
|
229 |
+
full_text = before_text + after_text
|
230 |
+
if check_word_spelling(full_text):
|
231 |
+
text = full_before_text + full_after_text
|
232 |
+
else:
|
233 |
+
text = full_before_text + " " + full_after_text
|
234 |
+
if match_token not in text:
|
235 |
+
break
|
236 |
+
except Exception as e:
|
237 |
+
logging.error(f"Error spell-checking OCR output, returning default text:\t{e}")
|
238 |
+
return text
|
239 |
|
240 |
|
241 |
def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
|