pdf-ocr

Sleeping

pszemraj committed on Jan 29, 2023

Commit

6585180

•

1 Parent(s): 23f1fc4

workaround for spellcheck fail

Files changed (1) hide show

pdf2text.py CHANGED Viewed

@@ -213,26 +213,29 @@ def eval_and_replace(text: str, match_token: str = "- ") -> str:
         str:  text with replaced tokens
     """
-    if match_token not in text:
-        return text
-    else:
-        while True:
-            full_before_text = text.split(match_token, maxsplit=1)[0]
-            before_text = [
-                char for char in full_before_text.split()[-1] if char.isalpha()
-            ]
-            before_text = "".join(before_text)
-            full_after_text = text.split(match_token, maxsplit=1)[-1]
-            after_text = [char for char in full_after_text.split()[0] if char.isalpha()]
-            after_text = "".join(after_text)
-            full_text = before_text + after_text
-            if check_word_spelling(full_text):
-                text = full_before_text + full_after_text
-            else:
-                text = full_before_text + " " + full_after_text
-            if match_token not in text:
-                break
-        return text
 def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:

         str:  text with replaced tokens
     """
+    try:
+        if match_token not in text:
+            return text
+        else:
+            while True:
+                full_before_text = text.split(match_token, maxsplit=1)[0]
+                before_text = [
+                    char for char in full_before_text.split()[-1] if char.isalpha()
+                ]
+                before_text = "".join(before_text)
+                full_after_text = text.split(match_token, maxsplit=1)[-1]
+                after_text = [char for char in full_after_text.split()[0] if char.isalpha()]
+                after_text = "".join(after_text)
+                full_text = before_text + after_text
+                if check_word_spelling(full_text):
+                    text = full_before_text + full_after_text
+                else:
+                    text = full_before_text + " " + full_after_text
+                if match_token not in text:
+                    break
+    except Exception as e:
+        logging.error(f"Error spell-checking OCR output, returning default text:\t{e}")
+    return text
 def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str: